gguf.py : indentation

Merge branch 'gguf' into gguf-write-single-pass
gguf.py : fix vertical alignment
2023-08-17 21:53:15 +03:00 · 2023-08-17 21:51:15 +03:00 · 2023-08-17 21:50:01 +03:00 · 2023-08-17 21:40:51 +03:00 · 2023-08-17 21:32:14 +03:00 · 2023-08-17 20:27:50 +02:00
40 changed files with 7626 additions and 2672 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.gguf
 *.bin
 .DS_Store
 .build/
@ -47,6 +48,8 @@ models-mnt
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
 build-info.h
 arm_neon.h
@ -64,7 +67,6 @@ perf-*.txt

 examples/jeopardy/results.txt

-
 pyproject.toml
 poetry.lock
 poetry.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -527,7 +527,6 @@ endif()
 add_library(llama
            llama.cpp
            llama.h
-            llama-util.h
            )

 target_include_directories(llama PUBLIC .)
--- a/9
+++ b/9
@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@ -329,7 +329,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h

 OBJS += ggml-alloc.o

-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
@ -345,7 +345,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf build-info.h $(TEST_TARGETS)

 #
 # Examples
@ -385,6 +385,9 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

+gguf: examples/gguf/gguf.cpp                                  build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

--- a/README.md
+++ b/README.md
@ -284,7 +284,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:

 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```

 ### MPI Build
@ -323,7 +323,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:

 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 ### BLAS Build
@ -506,10 +506,10 @@ python3 convert.py models/7B/
  python convert.py models/7B/ --vocabtype bpe

 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -565,7 +565,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh

 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@ -628,6 +628,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

+*Note: these instructions are likely obsoleted by the GGUF update*
+
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@ -703,7 +705,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run

 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@ -802,13 +804,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 ### Docker With CUDA
@ -839,8 +841,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.

 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```

 ### Contributing
--- a/ci/run.sh
+++ b/ci/run.sh
@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {

    python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {

    python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@ -0,0 +1,268 @@
+# HF gptneox--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTNeoXForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+llm_arch = "gptneox"
+gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+
+gguf_writer.add_architecture()
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+            continue
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+        old_dtype = data_dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data.dtype == np.float16:
+            data_dtype = np.float32
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
+            data_dtype = np.float32
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data_dtype = np.float16
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))
+
+        data = data.astype(data_dtype)
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@ -0,0 +1,265 @@
+# 7b pth llama --> gguf conversion, GQA/70b not supported
+# Only models with a single datafile are supported, like 7B
+# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("consolidated."):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+if num_parts > 1:
+    print("gguf: Only models with a single datafile are supported.")
+
+    sys.exit()
+llm_arch = "llama"
+gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)
+
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+gguf_writer.add_architecture()
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab and scores")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # TODO: How to determinate if a token is user defined?
+        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+        # if tokenizer.is_user_defined(i): toktype = 4
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name == "rope.freqs":
+            continue
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+        old_dtype = data_dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data.dtype == np.float16:
+            data_dtype = np.float32
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data_dtype = np.float32
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data_dtype = np.float16
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))
+
+        data = data.astype(data_dtype)
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@ -0,0 +1,294 @@
+# HF llama --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List, Optional
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+gguf_writer = gguf.GGUFWriter(fname_out, arch="llama")
+
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_architecture()
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # TODO: How to determinate if a token is user defined?
+        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+        # if tokenizer.is_user_defined(i): toktype = 4
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # reverse permute these
+        if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
+            data = reverse_hf_permute(data, head_count, head_count_kv)
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+        old_dtype = data_dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data.dtype == np.float16:
+            data_dtype = np.float32
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data_dtype = np.float32
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data_dtype = np.float16
+
+        data = data.astype(data_dtype)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert.py
+++ b/convert.py
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```

 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB

-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)

-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:

--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
@ -561,8 +549,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@ -650,24 +636,11 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    return "The";
 }

-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx           = params.n_ctx;
    lparams.n_batch         = params.n_batch;
-    lparams.n_gqa           = params.n_gqa;
-    lparams.rms_norm_eps    = params.rms_norm_eps;
    lparams.n_gpu_layers    = params.n_gpu_layers;
    lparams.main_gpu        = params.main_gpu;
    lparams.tensor_split    = params.tensor_split;
--- a/examples/common.h
+++ b/examples/common.h
@ -2,6 +2,7 @@

 #pragma once

+#define LLAMA_API_CPP // TODO: eliminate me
 #include "llama.h"

 #include <string>
@ -22,14 +23,12 @@ struct gpt_params {
    int32_t n_predict                       = -1;   // new tokens to predict
    int32_t n_ctx                           = 512;  // context size
    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa                           = 1;    // grouped-query attention factor (TODO: move to hparams)
    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
-    float   rms_norm_eps                    = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor

@ -53,7 +52,7 @@ struct gpt_params {
    std::string cfg_negative_prompt;       // string to help guidance
    float       cfg_scale         = 1.f;   // How strong is guidance

-    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_alias       = "unknown"; // model alias
    std::string prompt            = "";
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
@ -100,12 +99,6 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
 //
 // Model utils
 //
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+
 #include <unordered_map>
 #include <vector>
 #include <cassert>
@ -502,7 +503,7 @@ bool is_ggml_file(const char *filename) {
        return false;
    }
    uint32_t magic = file.read_u32();
-    return magic == LLAMA_FILE_MAGIC;
+    return magic == GGUF_MAGIC;
 }

 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
@ -590,75 +591,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
    if (file.fp == NULL) {
        return;
    }
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);

-    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-
-    // stuff AK weights into GG weights one by one.
-    // w->token_embedding_table -> model->tok_embeddings
-    // float*                   -> struct ggml_tensor
-    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-
-    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    //print_row(model->norm, 0);
-
-    // for rms-att-weight
-    int row_length = model->hparams.n_embd;
-    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
-    int n_ff = get_n_ff(&hparams);
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-        auto & layer = model->layers[i];
-        // 1d
-        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-
-        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-
-        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//
+//    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_score = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_score.tok.size());
+//        file.write_raw(token_score.tok.data(), token_score.tok.size());
+//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//    }
+//
+//    // stuff AK weights into GG weights one by one.
+//    // w->token_embedding_table -> model->tok_embeddings
+//    // float*                   -> struct ggml_tensor
+//    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+//    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+//
+//    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+//    //print_row(model->norm, 0);
+//
+//    // for rms-att-weight
+//    int row_length = model->hparams.n_embd;
+//    const auto & hparams = model->hparams;
+//    //int n_ff = model->hparams.n_embd;
+//    int n_ff = get_n_ff(&hparams);
+//
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+//        auto & layer = model->layers[i];
+//        // 1d
+//        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+//
+//        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+//        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+//
+//        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+//        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output); // ?
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
 }

 struct train_params get_default_train_params() {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
        }
        fprintf(stderr, "\n");
    }
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -0,0 +1,246 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+template<typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+bool gguf_ex_write(const std::string & fname) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str",                    std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 128ull*1024ull*1024ull,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx_data = ggml_init(params);
+
+    const int n_tensors = 10;
+
+    // tensor infos
+    for (int i = 0; i < n_tensors; ++i) {
+        const std::string name = "tensor_" + to_string(i);
+
+        int64_t ne[GGML_MAX_DIMS] = { 1 };
+        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
+
+        for (int j = 0; j < n_dims; ++j) {
+            ne[j] = rand() % 10 + 1;
+        }
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
+        ggml_set_name(cur, name.c_str());
+
+        {
+            float * data = (float *) cur->data;
+            for (int j = 0; j < ggml_nelements(cur); ++j) {
+                data[j] = 100 + i;
+            }
+        }
+
+        gguf_add_tensor(ctx, cur);
+    }
+
+    gguf_write_to_file(ctx, fname.c_str(), false);
+
+    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+// just read tensor info
+bool gguf_ex_read_0(const std::string & fname) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ NULL,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // find kv string
+    {
+        const char * findkey = "some.parameter.string";
+
+        const int keyidx = gguf_find_key(ctx, findkey);
+        if (keyidx == -1) {
+            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+        } else {
+            const char * key_value = gguf_get_val_str(ctx, keyidx);
+            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    gguf_free(ctx);
+
+    return true;
+}
+
+// read and create ggml_context containing the tensors and their data
+bool gguf_ex_read_1(const std::string & fname) {
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    // data
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+
+            const char * name = gguf_get_tensor_name(ctx, i);
+
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+
+            fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+            // print first 10 elements
+            const float * data = (const float *) cur->data;
+
+            printf("%s data[:10] : ", name);
+            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+                printf("%f ", data[j]);
+            }
+            printf("\n\n");
+
+            // check data
+            {
+                const float * data = (const float *) cur->data;
+                for (int j = 0; j < ggml_nelements(cur); ++j) {
+                    if (data[j] != 100 + i) {
+                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+        return -1;
+    }
+
+    const std::string fname(argv[1]);
+    const std::string mode (argv[2]);
+
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+
+    if (mode == "w") {
+        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
+    } else if (mode == "r") {
+        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+    }
+
+    return 0;
+}
--- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -191,10 +191,6 @@ int main(int argc, char ** argv) {

    // tokenize the prompt
    std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
    } else {
@ -270,15 +266,12 @@ int main(int argc, char ** argv) {
        params.interactive = true;
    }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
    if (params.verbose_prompt) {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
        }

        if (ctx_guidance) {
@ -286,14 +279,14 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
            }
        }

        if (params.n_keep > 0) {
        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
            }
            fprintf(stderr, "'\n");
        }
@ -311,7 +304,7 @@ int main(int argc, char ** argv) {
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
-        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

        fprintf(stderr, "%s: interactive mode on.\n", __func__);
@ -662,7 +655,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo) {
            for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
            }
            fflush(stdout);
        }
@ -782,8 +775,7 @@ int main(int argc, char ** argv) {
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);

-                        std::vector<const llama_grammar_element *> grammar_rules(
-                            parsed_grammar.c_rules());
+                        std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@ -2,7 +2,7 @@
 //
 // - First, export a LLaMA graph:
 //
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
 //
 // - Run this tool to evaluate the exported graph:
 //
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "build-info.h"

+#define LLAMA_API_CPP // TODO: eliminate me
 #define LLAMA_API_INTERNAL
 #include "llama.h"

@ -24,7 +25,7 @@
 #endif

 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.bin";
+    std::string model = "models/7B/ggml-model-f16.gguf";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    fprintf(stderr, "\nAllowed quantization types:\n");
@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
        arg_idx++;
    }
    else {
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx     = params.n_ctx;
-    lparams.n_gqa     = params.n_gqa;
    lparams.seed      = params.seed;
    lparams.f16_kv    = params.memory_f16;
    lparams.use_mmap  = params.use_mmap;
@ -45,9 +44,8 @@ int main(int argc, char ** argv) {
        llama_free_model(model);
        return 1;
    }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
        llama_free(ctx);
@ -92,7 +90,7 @@ int main(int argc, char ** argv) {
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
@ -152,7 +150,7 @@ int main(int argc, char ** argv) {
        auto next_token_str = llama_token_to_str(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
 Command line options:

 -   `--threads N`, `-t N`: Set the number of threads to use during computation.
-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 -   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@ -48,14 +48,12 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./server -m models/7B/ggml-model.bin -c 2048
+./server -m models/7B/ggml-model.gguf -c 2048
 ```

 ### Windows:

 ```powershell
-server.exe -m models\7B\ggml-model.bin -c 2048
-```

 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -652,8 +652,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@ -774,23 +772,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_ctx = std::stoi(argv[i]);
        }
-        else if (arg == "-gqa" || arg == "--gqa")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        }
-        else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
-        }
        else if (arg == "--rope-freq-base")
        {
            if (++i >= argc)
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -2,180 +2,129 @@
 #define _GNU_SOURCE
 #endif

-#include "common.h"
-#include "llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
+
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
    gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
        params.model = argv[1];
    }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
        params.prompt = argv[2];
    }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

    llama_backend_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    // tokenize the prompt

    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size     = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int) tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
    }

-    fflush(stdout);
+    fflush(stderr);

-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    const int n_gen = std::min(32, max_context_size);

-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
+        // evaluate the transformer
+
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

        llama_token new_token_id = 0;

-        auto logits  = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits  = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    } // wend of main loop
-
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

    llama_backend_free();

+    fprintf(stderr, "\n\n");
+
    return 0;
 }
-
-// EOF
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "common.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@ -16,7 +17,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+static const float rms_norm_eps = 1e-5f;

 struct random_normal_distribution {
    std::mt19937 gen;
@ -1961,7 +1962,7 @@ void print_matrix(struct ggml_tensor * probs) {


 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }

 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@ -2188,11 +2189,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
    f.read_raw(buf.data(), f.size);
    buf[f.size] = '\0';

-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
    }

    bool verify = false;
@ -2200,17 +2200,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
        const char * in  = buf.data();
        const char * end = buf.data() + buf.size();
        for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
            if (in >= end) {
                printf("%s: unexpected end of original text.\n", __func__);
                break;
            }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
            if (matches) {
                in += len;
            } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
            }
        }
    }
@ -2612,42 +2612,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
        return;
    }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//    // write_vocab
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_score = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_score.tok.size());
+//        file.write_raw(token_score.tok.data(), token_score.tok.size());
+//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output);
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
 }

 float cosine_decay(const int decay_steps, const float alpha, int step) {
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -38,6 +38,9 @@ struct ggml_metal_context;
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);

+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -237,6 +237,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
 }

+void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    if (result != 0) {
+        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+void ggml_metal_host_free(void * data) {
+    free(data);
+}
+
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
    ctx->n_cb = n_cb;
 }
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -207,14 +207,18 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4

-
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

+#define GGUF_MAGIC   0x47475546 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@ -562,6 +566,7 @@ extern "C" {
    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

    GGML_API int     ggml_blck_size (enum ggml_type type);
@ -1494,7 +1499,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);

-
    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
@ -1703,6 +1707,118 @@ extern "C" {

    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
    //
    // system info
    //
--- a/gguf.py
+++ b/gguf.py
@ -0,0 +1,669 @@
+import shutil
+import sys
+import struct
+import tempfile
+import numpy as np
+
+from enum import IntEnum, auto
+from typing import Any, IO, List
+
+#
+# constants
+#
+
+GGUF_MAGIC             = 0x47475546
+GGUF_VERSION           = 1
+GGUF_DEFAULT_ALIGNMENT = 32
+
+# general
+KEY_GENERAL_ARCHITECTURE         = "general.architecture"
+KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
+KEY_GENERAL_ALIGNMENT            = "general.alignment"
+KEY_GENERAL_NAME                 = "general.name"
+KEY_GENERAL_AUTHOR               = "general.author"
+KEY_GENERAL_URL                  = "general.url"
+KEY_GENERAL_DESCRIPTION          = "general.description"
+KEY_GENERAL_LICENSE              = "general.license"
+KEY_GENERAL_SOURCE_URL           = "general.source.url"
+KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+
+# LLM
+KEY_LLM_CONTEXT_LENGTH        = "{arch}.context_length"
+KEY_LLM_EMBEDDING_LENGTH      = "{arch}.embedding_length"
+KEY_LLM_BLOCK_COUNT           = "{arch}.block_count"
+KEY_LLM_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
+KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_LLM_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+
+# attention
+KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
+KEY_ATTENTION_HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
+KEY_ATTENTION_MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
+KEY_ATTENTION_CLAMP_KQV         = "{arch}.attention.clamp_kqv"
+KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
+KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+
+# RoPE
+KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_SCALE           = "{arch}.rope.scale"
+
+# tokenization
+KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
+KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
+KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
+KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
+KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
+KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
+KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
+KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
+KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"
+KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
+KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
+KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
+
+#
+# recommended mapping of model tensor names for storage in gguf
+#
+
+
+class MODEL_ARCH(IntEnum):
+    LLAMA   = auto()
+    FALCON  = auto()
+    GPT2    = auto()
+    GPTJ    = auto()
+    GPTNEOX = auto()
+    MPT     = auto()
+
+
+class MODEL_TENSOR(IntEnum):
+    TOKEN_EMBD    = auto()
+    POS_EMBD      = auto()
+    OUTPUT        = auto()
+    OUTPUT_NORM   = auto()
+    ROPE_FREQS    = auto()
+    ATTN_Q        = auto()
+    ATTN_K        = auto()
+    ATTN_V        = auto()
+    ATTN_QKV      = auto()
+    ATTN_OUT      = auto()
+    ATTN_NORM     = auto()
+    ATTN_NORM_2   = auto()
+    ATTN_ROT_EMBD = auto()
+    FFN_GATE      = auto()
+    FFN_DOWN      = auto()
+    FFN_UP        = auto()
+    FFN_NORM      = auto()
+
+
+MODEL_ARCH_NAMES = {
+    MODEL_ARCH.LLAMA:   "llama",
+    MODEL_ARCH.FALCON:  "falcon",
+    MODEL_ARCH.GPT2:    "gpt2",
+    MODEL_ARCH.GPTJ:    "gptj",
+    MODEL_ARCH.GPTNEOX: "gptneox",
+    MODEL_ARCH.MPT:     "mpt",
+}
+
+MODEL_TENSOR_NAMES = {
+    MODEL_ARCH.LLAMA: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.FALCON: {
+        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.OUTPUT:      "output",
+        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPT2: {
+        # TODO
+    },
+    # TODO
+}
+
+# tensors that will not be serialized
+MODEL_TENSOR_SKIP = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+}
+
+
+# TODO: the following helper functions should be removed
+#       instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
+#       however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
+# REMOVE
+def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
+    for skip in MODEL_TENSOR_SKIP.get(arch, []):
+        for i in range(n_blocks):
+            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
+                return True
+
+    return False
+
+
+def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
+    tensor_map = {}
+
+    # Token embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
+
+    tensor_map["gpt_neox.embed_in"]           = mapped_to  # gptneox
+    tensor_map["transformer.wte"]             = mapped_to  # gpt2 mpt
+    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
+    tensor_map["model.embed_tokens"]          = mapped_to  # llama-hf
+    tensor_map["tok_embeddings"]              = mapped_to  # llama-pth
+
+    # Position embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
+
+    tensor_map["transformer.wpe"] = mapped_to  # gpt2
+
+    # Output
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
+
+    tensor_map["embed_out"] = mapped_to  # gptneox
+    tensor_map["lm_head"]   = mapped_to  # gpt2 mpt falcon llama-hf
+    tensor_map["output"]    = mapped_to  # llama-pth
+
+    # Output norm
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
+
+    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
+    tensor_map["transformer.ln_f"]          = mapped_to  # gpt2 falcon
+    tensor_map["transformer.norm_f"]        = mapped_to  # mpt
+    tensor_map["model.norm"]                = mapped_to  # llama-hf
+    tensor_map["norm"]                      = mapped_to  # llama-pth
+
+    # Rope frequencies
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
+
+    tensor_map["rope.freqs"] = mapped_to  # llama-pth
+
+    # Attention and feed-forward blocks
+    for i in range(0, n_blocks):
+        # Attention norm
+        # TODO: is there are simpler way to write these 2 lines in Python?
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_1"]              = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_1"]       = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".input_layernorm"]   = mapped_to  # falcon7b
+        tensor_map["transformer.h."+str(i)+".ln_attn"]           = mapped_to  # falcon40b
+        tensor_map["model.layers."+str(i)+".input_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention_norm"]           = mapped_to  # llama-pth
+
+        # Attention norm 2
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to  # falcon40b
+
+        # Attention query-key-value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_attn"]                    = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"]                 = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon
+
+        # Attention query
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wq"]           = mapped_to  # llama-pth
+
+        # Attention key
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wk"]           = mapped_to  # llama-pth
+
+        # Attention value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wv"]           = mapped_to  # llama-pth
+
+        # Attention output
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"]   = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".self_attn.o_proj"]      = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wo"]                = mapped_to  # llama-pth
+
+        # Rotary embeddings
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"]  = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth
+
+        # Feed-forward norm
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_2"]                       = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_2"]                = mapped_to  # mpt
+        tensor_map["model.layers."+str(i)+".post_attention_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".ffn_norm"]                          = mapped_to  # llama-pth
+
+        # Feed-forward up
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_fc"]            = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"]    = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.up_proj"]          = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w3"]            = mapped_to  # llama-pth
+
+        # Feed-forward gate
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w1"]     = mapped_to  # llama-pth
+
+        # Feed-forward down
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"]  = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.down_proj"]        = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w2"]            = mapped_to  # llama-pth
+
+    return tensor_map
+
+#
+# implementation
+#
+
+
+class GGMLQuantizationType(IntEnum):
+    F32 = 0
+    F16 = 1
+
+
+class GGUFValueType(IntEnum):
+    UINT8   = 0
+    INT8    = 1
+    UINT16  = 2
+    INT16   = 3
+    UINT32  = 4
+    INT32   = 5
+    FLOAT32 = 6
+    BOOL    = 7
+    STRING  = 8
+    ARRAY   = 9
+
+    @staticmethod
+    def get_type(val):
+        if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
+            return GGUFValueType.STRING
+        elif isinstance(val, list):
+            return GGUFValueType.ARRAY
+        elif isinstance(val, float):
+            return GGUFValueType.FLOAT32
+        elif isinstance(val, bool):
+            return GGUFValueType.BOOL
+        elif isinstance(val, int):
+            return GGUFValueType.INT32
+        else:
+            print("Unknown type: "+str(type(val)))
+            sys.exit()
+
+
+class GGUFWriter:
+    def __init__(self, path: str, arch: str):
+        self.fout = open(path, "wb")
+        self.arch = arch
+        self.offset_tensor = 0
+        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
+        self.kv_data = b""
+        self.kv_data_count = 0
+        self.ti_data = b""
+        self.ti_data_count = 0
+        self.add_architecture()
+
+    def write_header_to_file(self):
+        self.fout.write(struct.pack("<I", GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", GGUF_VERSION))
+        self.fout.write(struct.pack("<I", self.ti_data_count))
+        self.fout.write(struct.pack("<I", self.kv_data_count))
+        self.flush()
+#        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
+
+    def write_kv_data_to_file(self):
+        self.fout.write(self.kv_data)
+        self.flush()
+
+    def write_ti_data_to_file(self):
+        self.fout.write(self.ti_data)
+        self.flush()
+
+    def add_key(self, key: str):
+        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+
+    def add_uint8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT8)
+
+    def add_int8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT8)
+
+    def add_uint16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT16)
+
+    def add_int16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT16)
+
+    def add_uint32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT32)
+
+    def add_int32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT32)
+
+    def add_float32(self, key: str, val: float):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.FLOAT32)
+
+    def add_bool(self, key: str, val: bool):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.BOOL)
+
+    def add_string(self, key: str, val: str):
+        if len(val) == 0:
+            return
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.STRING)
+
+    def add_array(self, key: str, val: list):
+        if not isinstance(val, list):
+            raise ValueError("Value must be a list for array type")
+
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.ARRAY)
+
+    def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
+        if vtype is None:
+            vtype = GGUFValueType.get_type(val)
+
+        if add_vtype:
+            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data_count += 1
+
+        if vtype == GGUFValueType.UINT8:
+            self.kv_data += struct.pack("<B", val)
+        elif vtype == GGUFValueType.INT8:
+            self.kv_data += struct.pack("<b", val)
+        elif vtype == GGUFValueType.UINT16:
+            self.kv_data += struct.pack("<H", val)
+        elif vtype == GGUFValueType.INT16:
+            self.kv_data += struct.pack("<h", val)
+        elif vtype == GGUFValueType.UINT32:
+            self.kv_data += struct.pack("<I", val)
+        elif vtype == GGUFValueType.INT32:
+            self.kv_data += struct.pack("<i", val)
+        elif vtype == GGUFValueType.FLOAT32:
+            self.kv_data += struct.pack("<f", val)
+        elif vtype == GGUFValueType.BOOL:
+            self.kv_data += struct.pack("?", val)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf8") if isinstance(val, str) else val
+            self.kv_data += struct.pack("<I", len(encoded_val))
+            self.kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY:
+            ltype = set([GGUFValueType.get_type(item) for item in val])
+            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
+            self.kv_data += struct.pack("<I", list(ltype)[0])
+            self.kv_data += struct.pack("<I", len(val))
+            for item in val:
+                self.add_val(item, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type")
+
+    @staticmethod
+    def ggml_pad(x: int, n: int) -> int:
+        return ((x + n - 1) // n) * n
+
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
+        assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+
+        encoded_name = name.encode("utf8")
+        self.ti_data += struct.pack("<I", len(encoded_name))
+        self.ti_data += encoded_name
+        n_dims = len(tensor_shape)
+        self.ti_data += struct.pack("<I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
+
+        dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        self.ti_data += struct.pack("<I", dtype)
+        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
+        self.ti_data_count += 1
+
+    def add_tensor(self, name: str, tensor: np.ndarray):
+        if not hasattr(self, "temp_file"):
+            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            self.temp_file.seek(0)
+
+        self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
+
+        tensor.tofile(self.temp_file)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+        if pad != 0:
+            self.temp_file.write(bytes([0] * pad))
+
+    def write_tensor_data(self, tensor: np.ndarray):
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        tensor.tofile(self.fout)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+    def write_tensors_to_file(self):
+        self.write_ti_data_to_file()
+
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        self.temp_file.seek(0)
+
+        shutil.copyfileobj(self.temp_file, self.fout)
+        self.flush()
+        self.temp_file.close()
+
+    def flush(self):
+        self.fout.flush()
+
+    def close(self):
+        self.fout.close()
+
+    def add_architecture(self):
+        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
+
+    def add_author(self, author: str):
+        self.add_string(KEY_GENERAL_AUTHOR, author)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_url(self, url: str):
+        self.add_string(KEY_GENERAL_URL, url)
+
+    def add_description(self, description: str):
+        self.add_string(KEY_GENERAL_DESCRIPTION, description)
+
+    def add_source_url(self, url: str):
+        self.add_string(KEY_GENERAL_SOURCE_URL, url)
+
+    def add_source_hf_repo(self, repo: str):
+        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
+
+    def add_name(self, name: str):
+        self.add_string(KEY_GENERAL_NAME, name)
+
+    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
+        self.add_uint32(
+            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+
+    def add_custom_alignment(self, alignment: int):
+        self.data_alignment = alignment
+        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
+
+    def add_context_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
+
+    def add_embedding_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_block_count(self, length: int):
+        self.add_uint32(
+            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_feed_forward_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_parallel_residual(self, use: bool):
+        self.add_bool(
+            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(
+            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_head_count(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
+
+    def add_head_count_kv(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
+
+    def add_max_alibi_bias(self, bias: float):
+        self.add_float32(
+            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
+
+    def add_clamp_kqv(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
+
+    def add_layer_norm_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
+
+    def add_layer_norm_rms_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+
+    def add_rope_dimension_count(self, count: int):
+        self.add_uint32(
+            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
+
+    def add_rope_scale(self, value:  float):
+        self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)
+
+    def add_tokenizer_model(self, model: str):
+        self.add_string(KEY_TOKENIZER_MODEL, model)
+
+    def add_token_list(self, tokens: List):
+        self.add_array(KEY_TOKENIZER_LIST, tokens)
+
+    def add_token_merges(self, merges: List):
+        self.add_array(KEY_TOKENIZER_MERGES, merges)
+
+    def add_token_types(self, types: List[int]):
+        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
+
+    def add_token_scores(self, scores: List[float]):
+        self.add_array(KEY_TOKENIZER_SCORES, scores)
+
+    def add_bos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
+
+    def add_eos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
+
+    def add_unk_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
+
+    def add_sep_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
+
+    def add_pad_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
+
+
+# Example usage:
+if __name__ == "__main__":
+    # Example usage with a file
+    gguf_writer = GGUFWriter("example.gguf", "llama")
+
+    gguf_writer.add_architecture()
+    gguf_writer.add_block_count(12)
+    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
+    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
+    gguf_writer.add_custom_alignment(64)
+
+    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
+    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
+    tensor3 = np.ones((96,), dtype=np.float32) * 102.0
+
+    gguf_writer.add_tensor("tensor1", tensor1)
+    gguf_writer.add_tensor("tensor2", tensor2)
+    gguf_writer.add_tensor("tensor3", tensor3)
+
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
--- a/llama-util.h
+++ b/llama-util.h
@ -1,553 +0,0 @@
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
-#ifndef LLAMA_UTIL_H
-#define LLAMA_UTIL_H
-
-#include <cstdio>
-#include <cstdint>
-#include <cerrno>
-#include <cstring>
-#include <cstdarg>
-#include <cstdlib>
-#include <climits>
-
-#include <string>
-#include <vector>
-#include <stdexcept>
-
-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/mman.h>
-        #endif
-        #if defined(_POSIX_MEMLOCK_RANGE)
-            #include <sys/resource.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #include <io.h>
-    #include <stdio.h> // for _fseeki64
-#endif
-
-#define LLAMA_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    LLAMA_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        LLAMA_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, len, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
-        }
-        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, len, 1, fp);
-        if (ret != 1) {
-            throw std::runtime_error(format("write error: %s", strerror(errno)));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-// llama_context_data
-struct llama_data_context {
-    virtual void write(const void * src, size_t size) = 0;
-    virtual size_t get_size_written() = 0;
-    virtual ~llama_data_context() = default;
-};
-
-struct llama_data_buffer_context : llama_data_context {
-    uint8_t* ptr;
-    size_t size_written = 0;
-
-    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
-
-    void write(const void * src, size_t size) override {
-        memcpy(ptr, src, size);
-        ptr += size;
-        size_written += size;
-    }
-
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-struct llama_data_file_context : llama_data_context {
-    llama_file* file;
-    size_t size_written = 0;
-
-    llama_data_file_context(llama_file * f) : file(f) {}
-
-    void write(const void * src, size_t size) override {
-        file->write_raw(src, size);
-        size_written += size;
-    }
-
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-#if defined(_WIN32)
-static std::string llama_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-struct llama_mmap {
-    void * addr;
-    size_t size;
-
-    llama_mmap(const llama_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-        // prefetch/readahead impairs performance on NUMA systems
-        if (numa) { prefetch = 0; }
-#ifdef __linux__
-        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
-            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
-        }
-
-        if (prefetch > 0) {
-            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-        if (numa) {
-            // advise the kernel not to use readahead
-            // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-    }
-
-    ~llama_mmap() {
-        munmap(addr, size);
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
-
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
-
-        if (hMapping == NULL) {
-            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        if (prefetch) {
-            // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
-            // will dynamically load it using GetProcAddress.
-            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
-            HMODULE hKernel32;
-
-            // This call is guaranteed to succeed.
-            hKernel32 = GetModuleHandleW(L"kernel32.dll");
-
-            // This call may fail if on a pre-Win8 system.
-            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
-
-            if (pPrefetchVirtualMemory) {
-                // Advise the kernel to preload the mapped memory.
-                WIN32_MEMORY_RANGE_ENTRY range;
-                range.VirtualAddress = addr;
-                range.NumberOfBytes = (SIZE_T)size;
-                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                            llama_format_win_err(GetLastError()).c_str());
-                }
-            }
-        }
-    }
-
-    ~llama_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
-        (void) prefetch;
-        (void) numa;
-
-        throw std::runtime_error(std::string("mmap not supported"));
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-    bool failed_already = false;
-
-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
-
-    ~llama_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * ptr) {
-        LLAMA_ASSERT(addr == NULL && size == 0);
-        addr = ptr;
-    }
-
-    void grow_to(size_t target_size) {
-        LLAMA_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-    #ifdef __APPLE__
-        #define MLOCK_SUGGESTION \
-            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-    #else
-        #define MLOCK_SUGGESTION \
-            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-    #endif
-
-    bool raw_lock(const void * addr, size_t size) {
-        if (!mlock(addr, size)) {
-            return true;
-        } else {
-            char* errmsg = std::strerror(errno);
-            bool suggest = (errno == ENOMEM);
-
-            // Check if the resource limit is fine after all
-            struct rlimit lock_limit;
-            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                suggest = false;
-            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                suggest = false;
-
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-            return false;
-        }
-    }
-
-    #undef MLOCK_SUGGESTION
-
-    void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * ptr, size_t len) {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(ptr, len)) {
-                return true;
-            }
-            if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    len, size, llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = len + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    void raw_unlock(void * ptr, size_t len) {
-        if (!VirtualUnlock(ptr, len)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    size_t lock_granularity() {
-        return (size_t) 65536;
-    }
-
-    bool raw_lock(const void * addr, size_t len) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-        return false;
-    }
-
-    void raw_unlock(const void * addr, size_t len) {}
-#endif
-};
-
-// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct llama_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    llama_buffer() = default;
-
-    void resize(size_t len) {
-#ifdef GGML_USE_METAL
-        free(addr);
-        int result = posix_memalign((void **) &addr, getpagesize(), len);
-        if (result == 0) {
-            memset(addr, 0, len);
-        }
-        else {
-            addr = NULL;
-        }
-#else
-        delete[] addr;
-        addr = new uint8_t[len];
-#endif
-        size = len;
-    }
-
-    ~llama_buffer() {
-#ifdef GGML_USE_METAL
-        free(addr);
-#else
-        delete[] addr;
-#endif
-        addr = NULL;
-    }
-
-    // disable copy and move
-    llama_buffer(const llama_buffer&) = delete;
-    llama_buffer(llama_buffer&&) = delete;
-    llama_buffer& operator=(const llama_buffer&) = delete;
-    llama_buffer& operator=(llama_buffer&&) = delete;
-};
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-struct llama_ctx_buffer {
-    uint8_t * addr = NULL;
-    bool is_cuda;
-    size_t size = 0;
-
-    llama_ctx_buffer() = default;
-
-    void resize(size_t size) {
-        free();
-
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
-        if (addr) {
-            is_cuda = true;
-        }
-        else {
-            // fall back to pageable memory
-            addr = new uint8_t[size];
-            is_cuda = false;
-        }
-        this->size = size;
-    }
-
-    void free() {
-        if (addr) {
-            if (is_cuda) {
-                ggml_cuda_host_free(addr);
-            }
-            else {
-                delete[] addr;
-            }
-        }
-        addr = NULL;
-    }
-
-    ~llama_ctx_buffer() {
-        free();
-    }
-
-    // disable copy and move
-    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
-    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
-    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
-    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
-};
-#else
-typedef llama_buffer llama_ctx_buffer;
-#endif
-
-#endif
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@ -34,29 +34,18 @@
 #    define DEPRECATED(func, hint) func
 #endif

-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF

-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

-#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif

-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -103,8 +92,6 @@ extern "C" {
        uint32_t seed;         // RNG seed, -1 for random
        int32_t  n_ctx;        // text context
        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors

@ -129,6 +116,7 @@ extern "C" {
        bool use_mlock;  // force system to keep model in RAM
        bool embedding;  // embedding mode only
    };
+
    // model file types
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32              = 0,
@ -155,7 +143,7 @@ extern "C" {
    // model quantization parameters
    typedef struct llama_model_quantize_params {
        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype   ftype;    // quantize to this llama_ftype
+        enum llama_ftype ftype;      // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
    } llama_model_quantize_params;
@ -208,17 +196,12 @@ extern "C" {
        int32_t n_eval;
    };

-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

-    LLAMA_API int llama_max_devices();
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
+    LLAMA_API int  llama_max_devices(void);
+    LLAMA_API bool llama_mmap_supported(void);
+    LLAMA_API bool llama_mlock_supported(void);

    // TODO: not great API - very likely to change
    // Initialize the llama + ggml backend
@ -226,9 +209,9 @@ extern "C" {
    // Call once at the start of the program
    LLAMA_API void llama_backend_init(bool numa);
    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
+    LLAMA_API void llama_backend_free(void);

-    LLAMA_API int64_t llama_time_us();
+    LLAMA_API int64_t llama_time_us(void);

    LLAMA_API struct llama_model * llama_load_model_from_file(
                             const char * path_model,
@ -240,13 +223,6 @@ extern "C" {
                     struct llama_model * model,
            struct llama_context_params   params);

-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");

    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
@ -336,6 +312,13 @@ extern "C" {
                             int   n_max_tokens,
                            bool   add_bos);

+    LLAMA_API int llama_tokenize_bpe(
+            struct llama_context * ctx,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
    LLAMA_API int llama_tokenize_with_model(
        const struct llama_model * model,
                      const char * text,
@ -377,18 +360,28 @@ extern "C" {
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    // Does not write null terminator to the buffer
+    LLAMA_API int llama_token_to_str(
            const struct llama_context * ctx,
-                           llama_token   token);
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);

-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_str_bpe(
+            const struct llama_context * ctx,
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);
+
+    LLAMA_API int llama_token_to_str_with_model(
              const struct llama_model * model,
-                           llama_token   token);
-
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);
    // Special tokens
-    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl();   // next-line
+    LLAMA_API llama_token llama_token_bos(void);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(void);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl(void);   // next-line

    // Grammar
    //
@ -468,19 +461,51 @@ extern "C" {
    // Print system information
    LLAMA_API const char * llama_print_system_info(void);

+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif

-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef LLAMA_API_INTERNAL
+// C++ API, will be moving to common.h soon (TM)
+#ifdef LLAMA_API_CPP

 #include <vector>
 #include <string>
+
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);
+
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);
+
+std::string llama_token_to_str(
+        const struct llama_context * ctx,
+                       llama_token   token);
+
+std::string llama_token_to_str_bpe(
+    const struct llama_context * ctx,
+                   llama_token   token);
+
+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
 struct ggml_tensor;

 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

-#endif
+#endif // LLAMA_API_CPP
+
+#endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
--- a/models/.editorconfig
+++ b/models/.editorconfig
@ -0,0 +1 @@
+root = true
--- a/models/ggml-vocab-llama.gguf
+++ b/models/ggml-vocab-llama.gguf
--- a/models/ggml-vocab.bin
+++ b/models/ggml-vocab.bin
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -1,4 +1,19 @@
-function(llama_add_test source)
+function(llama_build_executable source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    add_executable(${TEST_TARGET} ${source})
+    install(TARGETS ${TEST_TARGET} RUNTIME)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama)
+endfunction()
+
+function(llama_test_executable name source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    # add_executable(${TEST_TARGET} ${source})
+    # install(TARGETS ${TEST_TARGET} RUNTIME)
+    # target_link_libraries(${TEST_TARGET} PRIVATE llama)
+    add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+endfunction()
+
+function(llama_build_and_test_executable source)
    get_filename_component(TEST_TARGET ${source} NAME_WE)
    add_executable(${TEST_TARGET} ${source})
    install(TARGETS ${TEST_TARGET} RUNTIME)
@ -6,12 +21,16 @@ function(llama_add_test source)
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()

-# llama_add_test(test-double-float.cpp) # SLOW
-llama_add_test(test-quantize-fns.cpp)
-llama_add_test(test-quantize-perf.cpp)
-llama_add_test(test-sampling.cpp)
-llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-llama_add_test(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp)
-llama_add_test(test-llama-grammar.cpp  ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/common.cpp)
-llama_add_test(test-grad0.cpp) # SLOW
-# llama_add_test(test-opt.cpp) # SLOW
+# llama_build_and_test_executable(test-double-float.cpp) # SLOW
+llama_build_and_test_executable(test-quantize-fns.cpp)
+llama_build_and_test_executable(test-quantize-perf.cpp)
+llama_build_and_test_executable(test-sampling.cpp)
+llama_build_executable(test-tokenizer-0.cpp)
+llama_test_executable(test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_build_executable(test-tokenizer-1.cpp)
+llama_test_executable(test-tokenizer-1.llama  test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_build_and_test_executable(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp)
+llama_build_and_test_executable(test-llama-grammar.cpp  ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/common.cpp)
+llama_build_and_test_executable(test-grad0.cpp) # SLOW
+# llama_build_and_test_executable(test-opt.cpp) # SLOW
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -1,3 +1,4 @@
+#define LLAMA_API_CPP // TODO: eliminate me
 #include "llama.h"

 #include <cstdio>
@ -5,16 +6,40 @@
 #include <map>
 #include <vector>

-static const std::map<std::string, std::vector<llama_token>> & k_tests()
-{
+static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
+    std::string result;
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        result += llama_token_to_str(ctx, tokens[i]);
+    }
+    return result;
+}
+
+static const std::map<std::string, std::vector<llama_token>> & k_tests() {
    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "Hello World",        { 1,  10994,   2787, }, },
-        { " Hello World",       { 1,  15043,   2787, }, },
-        { " Hello World!",      { 1,  15043,   2787,  29991, }, },
-        { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
-        { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
-        { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+        { " ",                      {1,    259, }, },
+        { "\t",                     { 1,    29871,   12, }, },
+        { "\n",                     { 1,    29871,   13, }, },
+        { "\t\n",                   { 1,    29871,   12,     13, }, },
+        { "Hello world",            { 1,  15043,   3186, }, },
+        { " Hello world",           { 1,  29871,  15043,   3186, }, },
+        { "Hello World",            { 1,  15043,   2787, }, },
+        { " Hello World",           { 1,  29871,  15043,   2787, }, },
+        { " Hello World!",          { 1,  29871,  15043,   2787,  29991, }, },
+        { " this is 🦙.cpp",        { 1,  29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
+        { "w048 7tuijk dsdfhu",     { 1,    281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
+        { "нещо на Български",      { 1,   1538,   4851,    665,   1386,  29713,   1305, }, },
+        { "កាន់តែពិសេសអាចខលចេញ",   { 1,  29871,  31849,  31324,  31934,    228,    162,    142,    228,    161,
+                                     146,    228,    162,    133,    228,    161,    153,    228,    161,    186,
+                                     31708,    228,    162,    132,  31708,    228,    161,    165,  31324,    228,
+                                     161,    136,    228,    161,    132,    228,    161,    158,    228,    161,
+                                     136,    228,    162,    132,    228,    161,    140, }, },
+        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+            { 1,  29871,    243,    162,    157,    131,    313,   8945,  29897,  29871,
+                243,    162,    155,    185,  30722,    243,    162,    143,    174,  30598,
+                313,  20787,    953,   3848,    275,  16125,    630,  29897,  29871,  31681,
+                313,   6194,    953,  29877,   2397,    393,    756,    967,   1914,   5993,  29897, }, },
    };
+
    return _k_tests;
 };

@ -64,10 +89,12 @@ int main(int argc, char **argv) {
        return 2;
    }

+    bool success = true;
+
    for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
-        res.resize(n);
+        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
+        fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
+            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());

        bool correct = res.size() == test_kv.second.size();

@ -78,7 +105,8 @@ int main(int argc, char **argv) {
        }

        if (!correct) {
-            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);
@ -90,9 +118,7 @@ int main(int argc, char **argv) {
            }
            fprintf(stderr, "\n");

-            llama_free_model(model);
-            llama_free(ctx);
-            return 3;
+            success = false;
        }
    }

@ -101,5 +127,5 @@ int main(int argc, char **argv) {

    llama_backend_free();

-    return 0;
+    return success ? 0 : 3;
 }
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@ -0,0 +1,128 @@
+#define LLAMA_API_CPP // TODO: eliminate me
+#include "llama.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <codecvt>
+#include <map>
+#include <vector>
+#include <locale>
+
+static std::string vocab_type(llama_context * ctx) {
+    return llama_n_vocab(ctx) == 32000 ? "spm": "bpe";
+}
+
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += "\xe2\x96\x81";
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ') {
+            if (!escaping) {
+                result += "\xe2\x96\x81";
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string result;
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        result += llama_token_to_str(ctx, tokens[i]);
+    }
+    return result;
+}
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+    const int n_vocab = llama_n_vocab(ctx);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        std::string forward = llama_token_to_str_bpe(ctx, i);
+        std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
+        if (tokens.size() == 1) {
+            if (i != tokens[0]) {
+                std::string backward = llama_token_to_str(ctx, tokens[0]);
+                fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
+                return 2;
+            }
+        } else {
+            if ((vocab_type(ctx) == "spm" && i <= 258) ||
+                (vocab_type(ctx) == "bpe" && (i == 0 || i >= 100000))) {
+                fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
+            } else {
+                fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
+                return 2;
+            }
+        }
+    }
+
+    std::wstring_convert<typename std::codecvt_utf8<wchar_t>, wchar_t> converter;
+    for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
+        std::wstring wstr(1, ch);
+        std::string str;
+        try {
+            str = converter.to_bytes(wstr);
+        } catch (std::exception & e) {
+            continue;
+        }
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n",
+                __func__, str.c_str(), tokens[0]);
+        }
+    }
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}