Merge 'origin/master' into hipblas

Author: Henri Vasserman
Date:   2023-08-22 01:03:44 +03:00
Commit: 423db742e7
66 changed files with 13992 additions and 4771 deletions

.gitignore

@@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.gguf
 *.bin
 .DS_Store
 .build/
@@ -47,7 +48,10 @@ models-mnt
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
+/llama-bench
 build-info.h
 arm_neon.h
 compile_commands.json
@@ -64,7 +68,6 @@ perf-*.txt
 examples/jeopardy/results.txt
 pyproject.toml
 poetry.lock
 poetry.toml

CMakeLists.txt

@@ -297,7 +297,6 @@ if (LLAMA_METAL)
 find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
 find_library(METAL_FRAMEWORK Metal REQUIRED)
 find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
 set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
@@ -314,7 +313,6 @@ if (LLAMA_METAL)
 ${FOUNDATION_LIBRARY}
 ${METAL_FRAMEWORK}
 ${METALKIT_FRAMEWORK}
-${METALPERFORMANCE_FRAMEWORK}
 )
 endif()
@@ -537,9 +535,11 @@ else()
 endif()
 #
-# Build libraries
+# libraries
 #
+# ggml
 add_library(ggml OBJECT
 ggml.c
 ggml.h
@@ -564,10 +564,11 @@ if (BUILD_SHARED_LIBS)
 install(TARGETS ggml_shared LIBRARY)
 endif()
+# llama
 add_library(llama
 llama.cpp
 llama.h
-llama-util.h
 )
 target_include_directories(llama PUBLIC .)
@@ -586,6 +587,10 @@ if (BUILD_SHARED_LIBS)
 install(TARGETS llama LIBRARY)
 endif()
+#
+# install
+#
 include(GNUInstallDirs)
 install(
 FILES convert.py
@@ -609,11 +614,23 @@ install(
 WORLD_READ
 WORLD_EXECUTE
 DESTINATION ${CMAKE_INSTALL_BINDIR})
+if (LLAMA_METAL)
+install(
+FILES ggml-metal.metal
+PERMISSIONS
+OWNER_READ
+OWNER_WRITE
+GROUP_READ
+WORLD_READ
+DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
 #
 # programs, examples and tests
 #
+add_subdirectory(common)
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
 include(CTest)
 add_subdirectory(tests)

Makefile

@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
 default: $(BUILD_TARGETS)
@@ -45,8 +45,8 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
 CFLAGS = -I. $(OPT) -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
 LDFLAGS =
 ifdef LLAMA_DEBUG
@@ -307,7 +307,7 @@ endif # LLAMA_HIPBLAS
 ifdef LLAMA_METAL
 CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 CXXFLAGS += -DGGML_USE_METAL
-LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
 OBJS += ggml-metal.o
 endif # LLAMA_METAL
@@ -353,23 +353,23 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 OBJS += ggml-alloc.o
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 $(CXX) $(CXXFLAGS) -c $< -o $@
-common.o: examples/common.cpp examples/common.h
+common.o: common/common.cpp common/common.h
 $(CXX) $(CXXFLAGS) -c $< -o $@
-console.o: examples/console.cpp examples/console.h
+console.o: common/console.cpp common/console.h
 $(CXX) $(CXXFLAGS) -c $< -o $@
-grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 $(CXX) $(CXXFLAGS) -c $< -o $@
 libllama.so: llama.o ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 clean:
-rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)
+rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)
 #
 # Examples
@@ -409,12 +409,18 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 @sh scripts/build-info.sh > $@.tmp
 @if ! cmp -s $@.tmp $@; then \
@@ -436,7 +442,10 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
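Note: the Makefile changes above add two new build targets, `gguf` and `llama-bench`. A minimal sketch of building just those targets with GNU make (assuming the usual C/C++ toolchain is installed):

```bash
# builds only the two targets introduced in this commit;
# plain `make` still builds everything in BUILD_TARGETS
make gguf llama-bench
```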

README.md

@@ -9,13 +9,19 @@
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
-**Hot topics:**
+### Hot topics
-- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
-- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
-- New roadmap: https://github.com/users/ggerganov/projects/7
-- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
-- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
+Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
+### Current `master` should be considered in Beta - expect some issues for a few days!
+### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
+### Issues with non-GGUF models will be considered with low priority!
+----
 <details>
 <summary>Table of Contents</summary>
@@ -96,8 +102,10 @@ as the main playground for developing new features for the [ggml](https://github
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 **UI:**
@@ -238,12 +246,17 @@ In order to build llama.cpp you have three different options.
 cmake --build . --config Release
 ```
-- Using `Zig`:
+- Using `Zig` (version 0.11 or later):
+Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
+it's also possible to cross compile for other operating systems and architectures:
 ```bash
-zig build -Doptimize=ReleaseFast
+zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
 ```
+The `zig targets` command will give you valid options to use.
 - Using `gmake` (FreeBSD):
 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -284,7 +297,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:
 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```
 ### MPI Build
@@ -323,7 +336,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:
 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
 ### BLAS Build
@@ -408,7 +421,7 @@ Building the program with BLAS support may lead to some performance improvements
 |-------------------------|------------------------|---------|-------------|
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
@@ -535,10 +548,10 @@ python3 convert.py models/7B/
 python convert.py models/7B/ --vocabtype bpe
 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -594,7 +607,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh
 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -657,6 +670,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
+*Note: these instructions are likely obsoleted by the GGUF update*
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -732,7 +747,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run
 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run ./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw
+2. Run ./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@@ -831,13 +846,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 or with a light image:
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 ### Docker With CUDA
@@ -868,8 +883,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```
 ### Contributing

build.zig

@@ -1,5 +1,6 @@
 // Compatible with Zig Version 0.11.0
 const std = @import("std");
+const ArrayList = std.ArrayList;
 const Compile = std.Build.Step.Compile;
 const ConfigHeader = std.Build.Step.ConfigHeader;
 const Mode = std.builtin.Mode;
@@ -10,11 +11,31 @@ const Maker = struct {
 target: CrossTarget,
 optimize: Mode,
 config_header: *ConfigHeader,
+enable_lto: bool,
+include_dirs: ArrayList([]const u8),
+cflags: ArrayList([]const u8),
+cxxflags: ArrayList([]const u8),
+objs: ArrayList(*Compile),
-const cflags = .{"-std=c11"};
-const cxxflags = .{"-std=c++11"};
-fn init(builder: *std.build.Builder) Maker {
+fn addInclude(m: *Maker, dir: []const u8) !void {
+try m.include_dirs.append(dir);
+}
+fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
+try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+}
+fn addCFlag(m: *Maker, flag: []const u8) !void {
+try m.cflags.append(flag);
+}
+fn addCxxFlag(m: *Maker, flag: []const u8) !void {
+try m.cxxflags.append(flag);
+}
+fn addFlag(m: *Maker, flag: []const u8) !void {
+try m.addCFlag(flag);
+try m.addCxxFlag(flag);
+}
+fn init(builder: *std.build.Builder) !Maker {
 const commit_hash = @embedFile(".git/refs/heads/master");
 const config_header = builder.addConfigHeader(
 .{ .style = .blank, .include_path = "build-info.h" },
@@ -23,58 +44,71 @@ const Maker = struct {
 .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
 },
 );
-return Maker{
+var m = Maker{
 .builder = builder,
 .target = builder.standardTargetOptions(.{}),
 .optimize = builder.standardOptimizeOption(.{}),
 .config_header = config_header,
+.enable_lto = false,
+.include_dirs = ArrayList([]const u8).init(builder.allocator),
+.cflags = ArrayList([]const u8).init(builder.allocator),
+.cxxflags = ArrayList([]const u8).init(builder.allocator),
+.objs = ArrayList(*Compile).init(builder.allocator),
 };
+try m.addCFlag("-std=c11");
+try m.addCxxFlag("-std=c++11");
+try m.addProjectInclude(&.{});
+try m.addProjectInclude(&.{"examples"});
+return m;
 }
 fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
 const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
 if (std.mem.endsWith(u8, src, ".c")) {
-o.addCSourceFiles(&.{src}, &cflags);
+o.addCSourceFiles(&.{src}, m.cflags.items);
 o.linkLibC();
 } else {
-o.addCSourceFiles(&.{src}, &cxxflags);
+o.addCSourceFiles(&.{src}, m.cxxflags.items);
 o.linkLibCpp();
 }
-o.addIncludePath(.{ .path = "." });
-o.addIncludePath(.{ .path = "./examples" });
+for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+o.want_lto = m.enable_lto;
 return o;
 }
 fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
 const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
-e.addIncludePath(.{ .path = "." });
-e.addIncludePath(.{ .path = "./examples" });
-e.addCSourceFiles(&.{src}, &cxxflags);
+e.addCSourceFiles(&.{src}, m.cxxflags.items);
 for (deps) |d| e.addObject(d);
+for (m.objs.items) |o| e.addObject(o);
+for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
 e.linkLibC();
 e.linkLibCpp();
 e.addConfigHeader(m.config_header);
 m.builder.installArtifact(e);
+e.want_lto = m.enable_lto;
+// Currently a bug is preventing correct linking for optimized builds for Windows:
+// https://github.com/ziglang/zig/issues/15958
+if (e.target.isWindows()) {
+e.want_lto = false;
+}
 return e;
 }
 };
-pub fn build(b: *std.build.Builder) void {
-const make = Maker.init(b);
+pub fn build(b: *std.build.Builder) !void {
+var make = try Maker.init(b);
+make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
+try make.addFlag("-DGGML_USE_K_QUANTS");
+const k_quants = make.obj("k_quants", "k_quants.c");
+try make.objs.append(k_quants);
+}
 const ggml = make.obj("ggml", "ggml.c");
 const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
 const llama = make.obj("llama", "llama.cpp");
 const common = make.obj("common", "examples/common.cpp");
+const console = make.obj("common", "examples/console.cpp");
 const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
-_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
 _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
 _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
 _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });

ci/run.sh

@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
 python3 ../convert.py ${path_models}
-model_f16="${path_models}/ggml-model-f16.bin"
-model_q8_0="${path_models}/ggml-model-q8_0.bin"
-model_q4_0="${path_models}/ggml-model-q4_0.bin"
-model_q4_1="${path_models}/ggml-model-q4_1.bin"
-model_q5_0="${path_models}/ggml-model-q5_0.bin"
-model_q5_1="${path_models}/ggml-model-q5_1.bin"
-model_q2_k="${path_models}/ggml-model-q2_k.bin"
-model_q3_k="${path_models}/ggml-model-q3_k.bin"
-model_q4_k="${path_models}/ggml-model-q4_k.bin"
-model_q5_k="${path_models}/ggml-model-q5_k.bin"
-model_q6_k="${path_models}/ggml-model-q6_k.bin"
+model_f16="${path_models}/ggml-model-f16.gguf"
+model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 wiki_test_60="${path_wiki}/wiki.test-60.raw"
@@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {
 python3 ../convert.py ${path_models}
-model_f16="${path_models}/ggml-model-f16.bin"
-model_q8_0="${path_models}/ggml-model-q8_0.bin"
-model_q4_0="${path_models}/ggml-model-q4_0.bin"
-model_q4_1="${path_models}/ggml-model-q4_1.bin"
-model_q5_0="${path_models}/ggml-model-q5_0.bin"
-model_q5_1="${path_models}/ggml-model-q5_1.bin"
-model_q2_k="${path_models}/ggml-model-q2_k.bin"
-model_q3_k="${path_models}/ggml-model-q3_k.bin"
-model_q4_k="${path_models}/ggml-model-q4_k.bin"
-model_q5_k="${path_models}/ggml-model-q5_k.bin"
-model_q6_k="${path_models}/ggml-model-q6_k.bin"
+model_f16="${path_models}/ggml-model-f16.gguf"
+model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 wiki_test="${path_wiki}/wiki.test.raw"

common/CMakeLists.txt (new file)

@@ -0,0 +1,20 @@
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)

common/common.cpp

@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.n_ctx = std::stoi(argv[i]);
-} else if (arg == "-gqa" || arg == "--gqa") {
-if (++i >= argc) {
-invalid_param = true;
-break;
-}
-params.n_gqa = std::stoi(argv[i]);
-} else if (arg == "-eps" || arg == "--rms-norm-eps") {
-if (++i >= argc) {
-invalid_param = true;
-break;
-}
-params.rms_norm_eps = std::stof(argv[i]);
 } else if (arg == "--rope-freq-base") {
 if (++i >= argc) {
 invalid_param = true;
@@ -274,6 +262,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.cfg_negative_prompt = argv[i];
+} else if (arg == "--cfg-negative-prompt-file") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+std::ifstream file(argv[i]);
+if (!file) {
+fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+invalid_param = true;
+break;
+}
+std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+if (params.cfg_negative_prompt.back() == '\n') {
+params.cfg_negative_prompt.pop_back();
+}
 } else if (arg == "--cfg-scale") {
 if (++i >= argc) {
 invalid_param = true;
@@ -424,7 +427,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 params.hellaswag_tasks = std::stoi(argv[i]);
 } else if (arg == "--ignore-eos") {
-params.logit_bias[llama_token_eos()] = -INFINITY;
+params.ignore_eos = true;
 } else if (arg == "--no-penalize-nl") {
 params.penalize_nl = false;
 } else if (arg == "-l" || arg == "--logit-bias") {
@@ -546,8 +549,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
 fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
 fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
 fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
 fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
 fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -567,8 +568,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
 fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
 fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
-fprintf(stdout, " --cfg-negative-prompt PROMPT \n");
+fprintf(stdout, " --cfg-negative-prompt PROMPT\n");
 fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
+fprintf(stdout, " --cfg-negative-prompt-file FNAME\n");
+fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n");
 fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
 fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
 fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
@@ -633,24 +636,15 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 return "The";
 }
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-std::vector<llama_token> res(text.size() + (int) add_bos);
-const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-assert(n >= 0);
-res.resize(n);
-return res;
-}
+//
+// Model utils
+//
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
 auto lparams = llama_context_default_params();
 lparams.n_ctx = params.n_ctx;
 lparams.n_batch = params.n_batch;
-lparams.n_gqa = params.n_gqa;
-lparams.rms_norm_eps = params.rms_norm_eps;
 lparams.n_gpu_layers = params.n_gpu_layers;
 lparams.main_gpu = params.main_gpu;
 lparams.tensor_split = params.tensor_split;
@@ -668,7 +662,7 @@ struct llama_context_params llama_context_params_from_gpt_param
 return lparams;
 }
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
 auto lparams = llama_context_params_from_gpt_params(params);
 llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
@@ -697,5 +691,77 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 }
 }
+if (params.ignore_eos) {
+params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+}
 return std::make_tuple(model, lctx);
 }
+//
+// Vocab utils
+//
+std::vector<llama_token> llama_tokenize(
+struct llama_context * ctx,
+const std::string & text,
+bool add_bos) {
+// upper limit for the number of tokens
+int n_tokens = text.length() + add_bos;
+std::vector<llama_token> result(n_tokens);
+n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+if (n_tokens < 0) {
+result.resize(-n_tokens);
+int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+GGML_ASSERT(check == -n_tokens);
+} else {
+result.resize(n_tokens);
+}
+return result;
+}
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::vector<char> result(8, 0);
+const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+if (n_tokens < 0) {
+result.resize(-n_tokens);
+int check = llama_token_to_str(ctx, token, result.data(), result.size());
+GGML_ASSERT(check == -n_tokens);
+} else {
+result.resize(n_tokens);
+}
+return std::string(result.data(), result.size());
+}
+std::vector<llama_token> llama_tokenize_bpe(
+struct llama_context * ctx,
+const std::string & text,
+bool add_bos) {
+int n_tokens = text.length() + add_bos;
+std::vector<llama_token> result(n_tokens);
+n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+if (n_tokens < 0) {
+result.resize(-n_tokens);
+int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+GGML_ASSERT(check == -n_tokens);
+} else {
+result.resize(n_tokens);
+}
+return result;
+}
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
+std::vector<char> result(8, 0);
+const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+if (n_tokens < 0) {
+result.resize(-n_tokens);
+const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+GGML_ASSERT(check == -n_tokens);
+} else {
+result.resize(n_tokens);
+}
+return std::string(result.data(), result.size());
+}
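Note: the vocab helpers added above share one calling convention: the underlying llama.cpp call returns a negative count when the buffer is too small, so the wrapper resizes and retries. A minimal usage sketch built only on the wrappers declared in common.h (the `ctx` setup is assumed, e.g. obtained via `llama_init_from_gpt_params`; this snippet is illustrative, not part of the commit):

```cpp
#include <cstdio>
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

// Print the id and text of every token produced for a prompt.
static void dump_tokens(llama_context * ctx, const std::string & prompt) {
    // add_bos = true prepends the beginning-of-sequence token
    const std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, /*add_bos=*/true);
    for (const llama_token tok : tokens) {
        // llama_token_to_str performs the resize-and-retry dance internally
        printf("%6d -> '%s'\n", tok, llama_token_to_str(ctx, tok).c_str());
    }
}
```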

common/common.h

@@ -22,19 +22,16 @@ struct gpt_params {
 int32_t n_predict = -1; // new tokens to predict
 int32_t n_ctx = 512; // context size
 int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
 int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
 int32_t n_gpu_layers = 0; // number of layers to store in VRAM
 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
 float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
 int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
 float rope_freq_base = 10000.0f; // RoPE base frequency
 float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
 // sampling parameters
-std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 int32_t top_k = 40; // <= 0 to use vocab size
 float top_p = 0.95f; // 1.0 = disabled
 float tfs_z = 1.00f; // 1.0 = disabled
@@ -48,12 +45,14 @@ struct gpt_params {
 float mirostat_tau = 5.00f; // target entropy
 float mirostat_eta = 0.10f; // learning rate
+std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 // Classifier-Free Guidance
 // https://arxiv.org/abs/2306.17806
 std::string cfg_negative_prompt; // string to help guidance
 float cfg_scale = 1.f; // How strong is guidance
-std::string model = "models/7B/ggml-model.bin"; // model path
+std::string model = "models/7B/ggml-model-f16.gguf"; // model path
 std::string model_alias = "unknown"; // model alias
 std::string prompt = "";
 std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
@@ -83,6 +82,7 @@ struct gpt_params {
 bool simple_io = false; // improves compatibility with subprocesses and limited consoles
 bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+bool ignore_eos = false; // ignore generated EOS tokens
 bool instruct = false; // instruction mode (used for Alpaca models)
 bool penalize_nl = true; // consider newlines as a repeatable token
 bool perplexity = false; // compute perplexity over the prompt
@@ -100,15 +100,31 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 std::string gpt_random_prompt(std::mt19937 & rng);
-//
-// Vocab utils
-//
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
 //
 // Model utils
 //
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+//
+// Vocab utils
+//
+std::vector<llama_token> llama_tokenize(
+struct llama_context * ctx,
+const std::string & text,
+bool add_bos);
+std::vector<llama_token> llama_tokenize_bpe(
+struct llama_context * ctx,
+const std::string & text,
+bool add_bos);
+std::string llama_token_to_str(
+const struct llama_context * ctx,
+llama_token token);
+std::string llama_token_to_str_bpe(
+const struct llama_context * ctx,
+llama_token token);

convert-falcon-hf-to-gguf.py (new file)

@@ -0,0 +1,282 @@
# HF falcon--> gguf conversion
import gguf
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List
from pathlib import Path
from transformers import AutoTokenizer
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "RWForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name(last_dir)
gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: List[str] = []
merges: List[str] = []
if Path(dir_model + "/tokenizer.json").is_file():
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer merges")
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
merges = tokenizer_json["model"]["merges"]
gguf_writer.add_token_merges(merges)
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_token_list(tokens)
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
print("gguf: get special token ids")
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
# find special token ids
if "bos_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["bos_token"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["eos_token"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["unk_token"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]:
gguf_writer.add_pad_token_id(key["id"])
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data = torch.cat((q,k,v)).reshape_as(data)
data = data.squeeze().numpy()
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print("gguf: model successfully exported to '" + fname_out + "'")
print("")

convert-gptneox-hf-to-gguf.py (new file)

@@ -0,0 +1,266 @@
# HF gptneox--> gguf conversion
import gguf
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List
from pathlib import Path
from transformers import AutoTokenizer
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.GPTNEOX
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
gguf_writer.add_name(last_dir)
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
gguf_writer.add_head_count(hparams["num_attention_heads"])
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: List[str] = []
merges: List[str] = []
if Path(dir_model + "/tokenizer.json").is_file():
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer merges")
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
merges = tokenizer_json["model"]["merges"]
gguf_writer.add_token_merges(merges)
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_token_list(tokens)
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
print("gguf: get special token ids")
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
# find special token ids
if "bos_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["bos_token"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["eos_token"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["unk_token"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]:
gguf_writer.add_pad_token_id(key["id"])
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
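# For example, with num_parts == 2 the generator above yields
# "pytorch_model-00001-of-00002.bin" and "pytorch_model-00002-of-00002.bin".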
for part_name in part_names:
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Cannot map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32.
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print("gguf: model successfully exported to '" + fname_out + "'")
print("")


@ -0,0 +1,307 @@
# 7b pth llama --> gguf conversion
# Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
import gguf
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List
from pathlib import Path
from sentencepiece import SentencePieceProcessor
#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("consolidated."):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
if num_parts > 1:
print("gguf: Only models with a single datafile are supported.")
sys.exit()
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: cannot find ctx length parameter.")
sys.exit()
gguf_writer.add_name(last_dir)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: List[bytes] = []
scores: List[float] = []
toktypes: List[int] = []
if Path(dir_model + "/tokenizer.model").is_file():
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab and scores")
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # default to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
if Path(dir_model + "/added_tokens.json").is_file():
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
print("gguf: get special token ids")
if Path(dir_model + "/tokenizer.json").is_file():
# Look for special tokens in tokenizer.json if it exists
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer = json.load(f)
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["bos_token"]["content"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["eos_token"]["content"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["unk_token"]["content"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]["content"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]["content"]:
gguf_writer.add_pad_token_id(key["id"])
else:
# If no tokenizer.json: Look for special tokens in config.json
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
for part_name in part_names:
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name == "rope.freqs":
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Cannot map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32.
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print("gguf: model successfully exported to '" + fname_out + "'")
print("")


@ -0,0 +1,334 @@
import sys, struct, math, argparse
from pathlib import Path
import numpy as np
import gguf
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.F32 : (1, 4),
gguf.GGMLQuantizationType.F16 : (1, 2),
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
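# These (block size, type size) pairs are what Tensor.load below uses to turn an element count
# into a byte count: n_bytes = n_elements * type_size // block_size. As a worked example with
# hypothetical dimensions, a 4096 x 4096 Q4_0 tensor takes 4096 * 4096 * 18 // 32 = 9437184 bytes,
# versus 4096 * 4096 * 2 = 33554432 bytes in F16.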
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
self.n_ff = 0
def set_n_ff(self, model):
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
ff_tensor = model.tensors[ff_tensor_idx]
self.n_ff = ff_tensor.dims[1]
def load(self, data, offset):
(
self.n_vocab,
self.n_embd,
self.n_mult,
self.n_head,
self.n_layer,
self.n_rot,
self.ftype,
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
return 4 * 7
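# For reference, the header parsed above is seven consecutive little-endian uint32 fields
# (n_vocab, n_embd, n_mult, n_head, n_layer, n_rot, ftype), i.e. 28 bytes, which sit right
# after the 8-byte magic/version prefix checked in GGMLV3Model.validate_header below.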
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
class Vocab:
def __init__(self):
self.items = []
def load(self, data, offset, n_vocab):
orig_offset = offset
for _ in range(n_vocab):
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
assert itemlen < 4096, 'Absurd vocab item length'
offset += 4
vocab = bytes(data[offset:offset + itemlen])
offset += itemlen
score = struct.unpack('<f', data[offset:offset + 4])[0]
offset += 4
self.items.append((vocab, score))
return offset - orig_offset
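# Each vocab record parsed above is laid out as:
#   uint32 token_length | token_length bytes of token text | float32 score
# (little-endian), which is why the offset advances by 4 + itemlen + 4 per item.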
class Tensor:
def __init__(self):
self.name = None
self.dims = ()
self.dtype = None
self.start_offset = 0
self.len_bytes = 0
def load(self, data, offset):
orig_offset = offset
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= dtype
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
offset += name_len
pad = ((offset + 31) & ~31) - offset
offset += pad
n_elems = np.prod(self.dims)
n_bytes = (n_elems * tysize) // blksize
self.start_offset = offset
self.len_bytes = n_bytes
offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
class GGMLV3Model:
def __init__(self):
self.hyperparameters = None
self.vocab = None
self.tensor_map = {}
self.tensors = []
def validate_header(self, data, offset):
if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
raise ValueError('Only GGJTv3 supported')
return 8
def load(self, data, offset):
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
vocab = Vocab()
offset += vocab.load(data, offset, hp.n_vocab)
tensors = []
tensor_map = {}
while offset < len(data):
tensor = Tensor()
offset += tensor.load(data, offset)
tensor_map[tensor.name] = len(tensors)
tensors.append(tensor)
self.hyperparameters = hp
self.vocab = vocab
self.tensors = tensors
self.tensor_map = tensor_map
hp.set_n_ff(self)
return offset
class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
hp = ggml_model.hyperparameters
self.model = ggml_model
self.data = data
self.cfg = cfg
self.params_override = params_override
self.vocab_override = vocab_override
if params_override is not None:
n_kv_head = params_override.n_head_kv
else:
if cfg.gqa == 1:
n_kv_head = hp.n_head
else:
gqa = float(cfg.gqa)
n_kv_head = None
for x in range(1, 256):
if float(hp.n_head) / float(x) == gqa:
n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head
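# Worked example of the guess above (hypothetical values): with --gqa 8 and a model whose
# hyperparameters report n_head = 64, the loop picks x = 8 because 64 / 8 == 8.0, so
# n_kv_head becomes 8. With the default --gqa 1, n_kv_head simply equals n_head.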
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
def save(self):
print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
self.add_tensors(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
def add_params(self, gguf_writer):
hp = self.model.hyperparameters
cfg = self.cfg
desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
try:
# Filenames aren't necessarily valid UTF8.
name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError:
name = None
print('* Adding model parameters and KV items')
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
if self.params_override is not None:
po = self.params_override
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
gguf_writer.add_context_length (po.n_ctx)
gguf_writer.add_embedding_length (po.n_embd)
gguf_writer.add_block_count (po.n_layer)
gguf_writer.add_feed_forward_length (po.n_ff)
gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
gguf_writer.add_head_count (po.n_head)
gguf_writer.add_head_count_kv (po.n_head_kv)
gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
return
gguf_writer.add_context_length(cfg.context_length)
gguf_writer.add_embedding_length(hp.n_embd)
gguf_writer.add_block_count(hp.n_layer)
gguf_writer.add_feed_forward_length(hp.n_ff)
gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
gguf_writer.add_head_count(hp.n_head)
gguf_writer.add_head_count_kv(self.n_kv_head)
gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama')
tokens = []
scores = []
toktypes = []
if self.vocab_override is not None:
vo = self.vocab_override
print('* Adding vocab item(s)')
for (idx, vitem) in enumerate(vo.all_tokens()):
if len(vitem) == 3:
tokens.append(vitem[0])
scores.append(vitem[1])
toktypes.append(vitem[2])
else:
# Maybe try to guess the token type here?
tokens.append(vitem[0])
scores.append(vitem[1])
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes)
return
print(f'* Adding {hp.n_vocab} vocab item(s)')
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal
if len(vbytes) == 0:
tt = 3 # Control
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
hv = hex(vbytes[0])[2:].upper()
vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
tt = 6 # Byte
else:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
toktypes.append(tt)
tokens.append(vbytes)
scores.append(vscore)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
def add_tensors(self, gguf_writer):
nm = self.name_map
data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8')
if name.endswith('.weight'):
name = name[:-7]
suffix = '.weight'
elif name.endswith('.bias'):
name = name[:-5]
suffix = '.bias'
mapped_name = nm.get(name)
assert mapped_name is not None, f'Bad name {name}'
mapped_name += suffix
tempdims = list(tensor.dims[:])
if len(tempdims) > 1:
temp = tempdims[1]
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
def handle_metadata(cfg, hp):
import convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json"
# We pass a fake model here. "original" mode will check the shapes of some
# tensors if information is missing in the .json file: other than that, the
# model data isn't used so this should be safe (at least for now).
fakemodel = {
'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
}
fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
if hf_config_path.exists():
params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
elif orig_config_path.exists():
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else:
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
convert.check_vocab_size(params, vocab)
return (params, vocab)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
parser.add_argument('--name', help = 'Set model name')
parser.add_argument('--desc', help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
return parser.parse_args()
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model()
print('* Scanning GGML input file')
offset = model.load(data, 0)
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
if cfg.model_metadata_dir is not None:
(params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}')
else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
main()

convert-llama-hf-to-gguf.py (new file, 327 lines)

@ -0,0 +1,327 @@
# HF llama --> gguf conversion
import gguf
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List, Optional
from pathlib import Path
from sentencepiece import SentencePieceProcessor
#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
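# Minimal sketch (hypothetical shapes, not executed during conversion): the permute only
# reorders rows within each head's block, so the overall tensor shape is preserved:
#   w = np.arange(8 * 8, dtype=np.float32).reshape(8, 8)
#   assert reverse_hf_permute(w, n_head=2).shape == (8, 8)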
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
print("gguf: loading model "+last_dir)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "LlamaForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.LLAMA
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
else:
print("gguf: cannot find ctx length parameter.")
sys.exit()
gguf_writer.add_name(last_dir)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: List[bytes] = []
scores: List[float] = []
toktypes: List[int] = []
if Path(dir_model + "/tokenizer.model").is_file():
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # default to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
if Path(dir_model + "/added_tokens.json").is_file():
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
print("gguf: get special token ids")
if Path(dir_model + "/tokenizer.json").is_file():
# Look for special tokens in tokenizer.json if it exists
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
tokenizer = json.load(f)
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
tokenizer_config = json.load(f)
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["bos_token"]["content"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["eos_token"]["content"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["unk_token"]["content"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]["content"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
for key in tokenizer["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]["content"]:
gguf_writer.add_pad_token_id(key["id"])
else:
# If no tokenizer.json: Look for special tokens in config.json
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# reverse permute these
if name.endswith(".q_proj.weight"):
data = reverse_hf_permute(data, head_count)
if name.endswith(".k_proj.weight"):
data = reverse_hf_permute(data, head_count, head_count_kv)
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print("Cannot map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32.
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print("gguf: model successfully exported to '" + fname_out + "'")
print("")


@ -1,13 +0,0 @@
# Compatibility stub
import argparse
import convert
parser = argparse.ArgumentParser(
description="""[DEPRECATED - use `convert.py` instead]
Convert a LLaMA model checkpoint to a ggml compatible file""")
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
args = parser.parse_args()
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])

convert.py (1026 changed lines): file diff suppressed because it is too large.


@ -3,7 +3,7 @@
## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
CPU: 7 physical cores
RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result:


@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
# ...
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@ -45,6 +24,7 @@ else()
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple)
add_subdirectory(embd-input)
add_subdirectory(llama-bench)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()


@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include <cassert> #include <cassert>
@ -138,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
struct llama_vocab { struct llama_vocab {
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
using ttype = llama_token_type;
struct token_score { struct token_data {
token tok; token text;
float score; float score;
ttype type;
}; };
std::unordered_map<token, id> token_to_id; std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token; std::vector<token_data> id_to_token;
}; };
struct my_llama_hparams { struct my_llama_hparams {
@ -502,7 +505,7 @@ bool is_ggml_file(const char *filename) {
return false; return false;
} }
uint32_t magic = file.read_u32(); uint32_t magic = file.read_u32();
return magic == LLAMA_FILE_MAGIC; return magic == GGUF_MAGIC;
} }
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
@ -515,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
std::vector<const char *> strings; const int n_vocab = llama_n_vocab(lctx);
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
vocab->id_to_token.resize(n_vocab); vocab->id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) { for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]); vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
float score = scores[i]; vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
vocab->id_to_token[i].tok = tok; vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
vocab->id_to_token[i].score = score; vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
vocab->token_to_id.emplace(tok, i);
} }
llama_free(lctx); llama_free(lctx);
llama_free_model(lmodel); llama_free_model(lmodel);
} else { // assume llama2.c vocabulary } else { // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename); printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
llama_file file(filename, "rb"); llama_file file(filename, "rb");
uint32_t n_vocab = config->vocab_size; const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused /* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab); vocab->id_to_token.resize(n_vocab);
for (uint32_t i=0; i<n_vocab; ++i) { for (int i=0; i<n_vocab; ++i) {
float_t score = file.read_f32(); float_t score = file.read_f32();
uint32_t len = file.read_u32(); uint32_t len = file.read_u32();
std::string tok = file.read_string(len); std::string text = file.read_string(len);
vocab->id_to_token[i].tok = tok; vocab->id_to_token[i].text = text;
vocab->id_to_token[i].score = score; vocab->id_to_token[i].score = score;
vocab->token_to_id.emplace(tok, i); vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
vocab->token_to_id.emplace(text, i);
} }
} }
} }
@ -590,75 +587,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
if (file.fp == NULL) { if (file.fp == NULL) {
return; return;
} }
// write_magic
file.write_u32(LLAMA_FILE_MAGIC); // magic
file.write_u32(LLAMA_FILE_VERSION); // version
// write_hparams
file.write_u32(model->hparams.n_vocab);
file.write_u32(model->hparams.n_embd);
file.write_u32(model->hparams.n_mult);
file.write_u32(model->hparams.n_head);
file.write_u32(model->hparams.n_layer);
file.write_u32(model->hparams.n_rot);
file.write_u32(LLAMA_FTYPE_ALL_F32);
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. #pragma message("TODO: implement file saving using gguf")
uint32_t n_vocab = model->hparams.n_vocab; (void) vocab;
for (uint32_t i = 0; i < n_vocab; i++) { (void) model;
const auto & token_score = vocab->id_to_token.at(i); (void) w;
file.write_u32((uint32_t) token_score.tok.size()); // // write_magic
file.write_raw(token_score.tok.data(), token_score.tok.size()); // file.write_u32(LLAMA_FILE_MAGIC); // magic
file.write_raw(&token_score.score, sizeof(token_score.score)); // file.write_u32(LLAMA_FILE_VERSION); // version
} // // write_hparams
// file.write_u32(model->hparams.n_vocab);
// stuff AK weights into GG weights one by one. // file.write_u32(model->hparams.n_embd);
// w->token_embedding_table -> model->tok_embeddings // file.write_u32(model->hparams.n_mult);
// float* -> struct ggml_tensor // file.write_u32(model->hparams.n_head);
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); // file.write_u32(model->hparams.n_layer);
stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table); // file.write_u32(model->hparams.n_rot);
// file.write_u32(LLAMA_FTYPE_ALL_F32);
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); //
//print_row(model->norm, 0); // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
// uint32_t n_vocab = model->hparams.n_vocab;
// for rms-att-weight // for (uint32_t i = 0; i < n_vocab; i++) {
int row_length = model->hparams.n_embd; // const auto & token_data = vocab->id_to_token.at(i);
const auto & hparams = model->hparams; // file.write_u32((uint32_t) token_data.tok.size());
//int n_ff = model->hparams.n_embd; // file.write_raw(token_data.tok.data(), token_data.tok.size());
int n_ff = get_n_ff(&hparams); // file.write_raw(&token_data.score, sizeof(token_data.score));
// }
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ //
auto & layer = model->layers[i]; // // stuff AK weights into GG weights one by one.
// 1d // // w->token_embedding_table -> model->tok_embeddings
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); // // float* -> struct ggml_tensor
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
// stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim //
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); // //print_row(model->norm, 0);
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); //
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); // // for rms-att-weight
// int row_length = model->hparams.n_embd;
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); // const auto & hparams = model->hparams;
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); // //int n_ff = model->hparams.n_embd;
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); // int n_ff = get_n_ff(&hparams);
} //
// write tensors // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
write_tensor(&file, model->tok_embeddings); // auto & layer = model->layers[i];
write_tensor(&file, model->norm); // // 1d
write_tensor(&file, model->output); // ? // stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { // stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
auto & layer = model->layers[i]; //
// // from 3d matrix layer x dim x dim to 2d matrix dim x dim
write_tensor(&file, layer.attention_norm); // stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
write_tensor(&file, layer.wq); // stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
write_tensor(&file, layer.wk); // stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
write_tensor(&file, layer.wv); // stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
write_tensor(&file, layer.wo); //
write_tensor(&file, layer.ffn_norm); // stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
write_tensor(&file, layer.w1); // stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
write_tensor(&file, layer.w2); // stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
write_tensor(&file, layer.w3); // }
} // // write tensors
// write_tensor(&file, model->tok_embeddings);
// write_tensor(&file, model->norm);
// write_tensor(&file, model->output); // ?
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
// auto & layer = model->layers[i];
//
// write_tensor(&file, layer.attention_norm);
// write_tensor(&file, layer.wq);
// write_tensor(&file, layer.wk);
// write_tensor(&file, layer.wv);
// write_tensor(&file, layer.wo);
// write_tensor(&file, layer.ffn_norm);
// write_tensor(&file, layer.w1);
// write_tensor(&file, layer.w2);
// write_tensor(&file, layer.w3);
// }
} }
struct train_params get_default_train_params() { struct train_params get_default_train_params() {

View file

@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl()];
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl()] = nl_logit;
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
@ -211,7 +211,7 @@ const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
if (id == llama_token_eos()) {
if (id == llama_token_eos(ctx)) {
ret = "</s>";
} else {
ret = llama_token_to_str(ctx, id);


@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "\n");
}

examples/gguf/gguf.cpp (new file, 246 lines)

@ -0,0 +1,246 @@
#include "ggml.h"
#include "llama.h"
#include <cstdio>
#include <cinttypes>
#include <string>
#include <sstream>
#include <fstream>
#include <vector>
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
template<typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
gguf_set_val_bool(ctx, "some.parameter.bool", true);
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
struct ggml_init_params params = {
/*.mem_size =*/ 128ull*1024ull*1024ull,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx_data = ggml_init(params);
const int n_tensors = 10;
// tensor infos
for (int i = 0; i < n_tensors; ++i) {
const std::string name = "tensor_" + to_string(i);
int64_t ne[GGML_MAX_DIMS] = { 1 };
int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
for (int j = 0; j < n_dims; ++j) {
ne[j] = rand() % 10 + 1;
}
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
ggml_set_name(cur, name.c_str());
{
float * data = (float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
data[j] = 100 + i;
}
}
gguf_add_tensor(ctx, cur);
}
gguf_write_to_file(ctx, fname.c_str(), false);
fprintf(stdout, "%s: wrote file '%s'\n", __func__, fname.c_str());
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
// just read tensor info
bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// find kv string
{
const char * findkey = "some.parameter.string";
const int keyidx = gguf_find_key(ctx, findkey);
if (keyidx == -1) {
fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
} else {
const char * key_value = gguf_get_val_str(ctx, keyidx);
fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
gguf_free(ctx);
return true;
}
// read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
// data
{
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
// print first 10 elements
const float * data = (const float *) cur->data;
printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]);
}
printf("\n\n");
// check data
{
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
return false;
}
}
}
}
}
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
return -1;
}
const std::string fname(argv[1]);
const std::string mode (argv[2]);
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
if (mode == "w") {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
}
return 0;
}
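For orientation, a minimal sketch of exercising this example (assuming the example binary is built as `gguf`; the file name simply follows the usage string printed by `main`):

```bash
# write a small test file, then read it back with both readers
./gguf data.gguf w
./gguf data.gguf r
```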

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,8 @@
set(TARGET llama-bench)
add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
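A minimal sketch of building the new target with CMake (the out-of-tree `build` directory name is an assumption, not part of this change):

```bash
cmake -B build
cmake --build build --target llama-bench
```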

View file

@ -0,0 +1,969 @@
#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <cstring>
#include <ctime>
#include <iterator>
#include <map>
#include <numeric>
#include <regex>
#include <sstream>
#include <stdio.h>
#include <string>
#include <vector>
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "build-info.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
// utils
static uint64_t get_time_ns() {
using clock = std::chrono::high_resolution_clock;
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}
template<class T>
static std::string join(const std::vector<T> & values, const std::string & delim) {
std::ostringstream str;
for (size_t i = 0; i < values.size(); i++) {
str << values[i];
if (i < values.size() - 1) {
str << delim;
}
}
return str.str();
}
template<class T>
static std::vector<T> split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}
template<typename T>
static T avg(const std::vector<T> & v) {
if (v.empty()) {
return 0;
}
T sum = std::accumulate(v.begin(), v.end(), T(0));
return sum / (T)v.size();
}
template<typename T>
static T stdev(const std::vector<T> & v) {
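// sample standard deviation (Bessel-corrected, n - 1 in the denominator);
// feeds the stddev_ns/stddev_ts fields and the "±" column of the markdown output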
if (v.size() <= 1) {
return 0;
}
T mean = avg(v);
T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
return stdev;
}
static bool ggml_cpu_has_metal() {
#if defined(GGML_USE_METAL)
return true;
#else
return false;
#endif
}
static std::string get_cpu_info() {
std::string id;
#ifdef __linux__
FILE * f = fopen("/proc/cpuinfo", "r");
if (f) {
char buf[1024];
while (fgets(buf, sizeof(buf), f)) {
if (strncmp(buf, "model name", 10) == 0) {
char * p = strchr(buf, ':');
if (p) {
p++;
while (std::isspace(*p)) {
p++;
}
while (std::isspace(p[strlen(p) - 1])) {
p[strlen(p) - 1] = '\0';
}
id = p;
break;
}
}
}
}
#endif
// TODO: other platforms
return id;
}
static std::string get_gpu_info() {
std::string id;
#ifdef GGML_USE_CUBLAS
int count = ggml_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_cuda_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
// TODO: other backends
return id;
}
// command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL};
struct cmd_params {
std::vector<std::string> model;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<int> n_batch;
std::vector<bool> f32_kv;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<int> main_gpu;
std::vector<bool> mul_mat_q;
std::vector<bool> low_vram;
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
int reps;
bool verbose;
output_formats output_format;
};
static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.bin"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_batch */ {512},
/* f32_kv */ {false},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* main_gpu */ {0},
/* mul_mat_q */ {true},
/* low_vram */ {false},
/* tensor_split */ {{}},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
};
static void print_usage(int /* argc */, char ** argv) {
fprintf(stdout, "usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n");
fprintf(stdout, "options:\n");
fprintf(stdout, " -h, --help\n");
fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
fprintf(stdout, " -ts, --tensor_split <ts> \n");
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : "md");
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
fprintf(stdout, "\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by repeating the parameter.\n");
}
static cmd_params parse_cmd_params(int argc, char ** argv) {
cmd_params params;
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
const char split_delim = ',';
params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format;
params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-h" || arg == "--help") {
print_usage(argc, argv);
exit(0);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "--memory-f32") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.main_gpu = split<int>(argv[i], split_delim);
} else if (arg == "-lv" || arg == "--low-vram") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
break;
}
for (auto ts : split<std::string>(argv[i], split_delim)) {
// split string by ; and /
const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
tensor_split[i] = std::stof(split_arg[i]);
} else {
tensor_split[i] = 0.0f;
}
}
params.tensor_split.push_back(tensor_split);
}
} else if (arg == "-r" || arg == "--repetitions") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.reps = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
break;
}
if (argv[i] == std::string("csv")) {
params.output_format = CSV;
} else if (argv[i] == std::string("json")) {
params.output_format = JSON;
} else if (argv[i] == std::string("md")) {
params.output_format = MARKDOWN;
} else if (argv[i] == std::string("sql")) {
params.output_format = SQL;
} else {
invalid_param = true;
break;
}
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
invalid_param = true;
break;
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
// set defaults
if (params.model.empty()) { params.model = cmd_params_defaults.model; }
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
if (params.low_vram.empty()) { params.low_vram = cmd_params_defaults.low_vram; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params;
}
struct cmd_params_instance {
std::string model;
int n_prompt;
int n_gen;
int n_batch;
bool f32_kv;
int n_threads;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
bool low_vram;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
llama_context_params to_llama_params() const {
llama_context_params lparams = llama_context_default_params();
lparams.n_ctx = n_prompt + n_gen;
lparams.n_batch = n_batch;
lparams.f16_kv = !f32_kv;
lparams.n_gpu_layers = n_gpu_layers;
lparams.main_gpu = main_gpu;
lparams.mul_mat_q = mul_mat_q;
lparams.low_vram = low_vram;
lparams.tensor_split = tensor_split.data();
return lparams;
}
};
static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
std::vector<cmd_params_instance> instances;
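// build one instance per combination of the multi-valued parameters (a Cartesian product),
// with n_prompt and n_gen fixed by the caller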
for (const auto & m : params.model)
for (const auto & nb : params.n_batch)
for (const auto & fk : params.f32_kv)
for (const auto & nl : params.n_gpu_layers)
for (const auto & mg : params.main_gpu)
for (const auto & mmq : params.mul_mat_q)
for (const auto & lv : params.low_vram)
for (const auto & ts : params.tensor_split)
for (const auto & nt : params.n_threads) {
cmd_params_instance instance = {
/* .model = */ m,
/* .n_prompt = */ n_prompt,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .f32_kv = */ fk,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
/* .mul_mat_q = */ mmq,
/* .low_vram = */ lv,
/* .tensor_split = */ ts,
};
instances.push_back(instance);
}
return instances;
}
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
std::vector<cmd_params_instance> instances;
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
}
auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
}
for (const auto & n_gen : params.n_gen) {
if (n_gen == 0) {
continue;
}
auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
}
return instances;
}
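// records the build, hardware and parameter configuration of one benchmark run together with its timing samples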
struct test {
static const std::string build_commit;
static const int build_number;
static const bool cuda;
static const bool opencl;
static const bool metal;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
static const std::string gpu_info;
std::string model_filename;
std::string model_type;
int n_batch;
int n_threads;
bool f32_kv;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
bool low_vram;
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
int n_prompt;
int n_gen;
std::string test_time;
std::vector<uint64_t> samples_ns;
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
model_filename = inst.model;
char buf[128];
llama_model_type(lmodel, buf, sizeof(buf));
model_type = buf;
n_batch = inst.n_batch;
n_threads = inst.n_threads;
f32_kv = inst.f32_kv;
n_gpu_layers = inst.n_gpu_layers;
main_gpu = inst.main_gpu;
mul_mat_q = inst.mul_mat_q;
low_vram = inst.low_vram;
tensor_split = inst.tensor_split;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
// RFC 3339 date-time format
time_t t = time(NULL);
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
test_time = buf;
(void) ctx;
}
uint64_t avg_ns() const {
return ::avg(samples_ns);
}
uint64_t stdev_ns() const {
return ::stdev(samples_ns);
}
std::vector<double> get_ts() const {
int n_tokens = n_prompt + n_gen;
std::vector<double> ts;
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
return ts;
}
double avg_ts() const {
return ::avg(get_ts());
}
double stdev_ts() const {
return ::stdev(get_ts());
}
static std::string get_backend() {
if (cuda) {
return "CUDA";
}
if (opencl) {
return "OpenCL";
}
if (metal) {
return "Metal";
}
if (gpu_blas) {
return "GPU BLAS";
}
if (blas) {
return "BLAS";
}
return "CPU";
}
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
"cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type",
"n_batch", "n_threads", "f16_kv",
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
};
return fields;
}
enum field_type {STRING, BOOL, INT, FLOAT};
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
return FLOAT;
}
return STRING;
}
std::vector<std::string> get_values() const {
std::string tensor_split_str;
int max_nonzero = 0;
for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
if (tensor_split[i] > 0) {
max_nonzero = i;
}
}
for (int i = 0; i <= max_nonzero; i++) {
char buf[32];
snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
tensor_split_str += buf;
if (i < max_nonzero) {
tensor_split_str += "/";
}
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type,
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
};
return values;
}
std::map<std::string, std::string> get_map() const {
std::map<std::string, std::string> map;
auto fields = get_fields();
auto values = get_values();
std::transform(fields.begin(), fields.end(), values.begin(),
std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
return map;
}
};
const std::string test::build_commit = BUILD_COMMIT;
const int test::build_number = BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
struct printer {
virtual ~printer() {}
FILE * fout;
virtual void print_header(const cmd_params & params) { (void) params; };
virtual void print_test(const test & t) = 0;
virtual void print_footer() { };
};
struct csv_printer : public printer {
static std::string escape_csv(const std::string & field) {
std::string escaped = "\"";
for (auto c : field) {
if (c == '"') {
escaped += "\"";
}
escaped += c;
}
escaped += "\"";
return escaped;
}
void print_header(const cmd_params & params) override {
std::vector<std::string> fields = test::get_fields();
fprintf(fout, "%s\n", join(fields, ",").c_str());
(void) params;
}
void print_test(const test & t) override {
std::vector<std::string> values = t.get_values();
std::transform(values.begin(), values.end(), values.begin(), escape_csv);
fprintf(fout, "%s\n", join(values, ",").c_str());
}
};
struct json_printer : public printer {
bool first = true;
static std::string escape_json(const std::string & value) {
std::string escaped;
for (auto c : value) {
if (c == '"') {
escaped += "\\\"";
} else if (c == '\\') {
escaped += "\\\\";
} else if (c <= 0x1f) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", c);
escaped += buf;
} else {
escaped += c;
}
}
return escaped;
}
static std::string format_value(const std::string & field, const std::string & value) {
switch (test::get_field_type(field)) {
case test::STRING:
return "\"" + escape_json(value) + "\"";
case test::BOOL:
return value == "0" ? "false" : "true";
default:
return value;
}
}
void print_header(const cmd_params & params) override {
fprintf(fout, "[\n");
(void) params;
}
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
}
}
void print_test(const test & t) override {
if (first) {
first = false;
} else {
fprintf(fout, ",\n");
}
fprintf(fout, " {\n");
print_fields(test::get_fields(), t.get_values());
fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
fprintf(fout, " }");
fflush(fout);
}
void print_footer() override {
fprintf(fout, "\n]\n");
}
};
struct markdown_printer : public printer {
std::vector<std::string> fields;
static int get_field_width(const std::string & field) {
if (field == "model") {
return -30;
}
if (field == "t/s") {
return 15;
}
int width = std::max((int)field.length(), 10);
if (test::get_field_type(field) == test::STRING) {
return -width;
}
return width;
}
void print_header(const cmd_params & params) override {
// select fields to print
fields = { "model", "backend" };
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
if (!is_cpu_backend) {
fields.push_back("n_gpu_layers");
}
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.push_back("n_threads");
}
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.push_back("n_batch");
}
if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
fields.push_back("f16_kv");
}
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu");
}
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
fields.push_back("mul_mat_q");
}
if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
fields.push_back("low_vram");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.push_back("tensor_split");
}
fields.push_back("test");
fields.push_back("t/s");
fprintf(fout, "|");
for (const auto & field : fields) {
fprintf(fout, " %*s |", get_field_width(field), field.c_str());
}
fprintf(fout, "\n");
fprintf(fout, "|");
for (const auto & field : fields) {
int width = get_field_width(field);
fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
}
fprintf(fout, "\n");
}
void print_test(const test & t) override {
std::map<std::string, std::string> vmap = t.get_map();
fprintf(fout, "|");
for (const auto & field : fields) {
std::string value;
if (field == "model") {
value = t.model_type;
} else if (field == "backend") {
value = test::get_backend();
} else if (field == "test") {
char buf[128];
if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
} else if (t.n_gen > 0 && t.n_prompt == 0) {
snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
} else {
assert(false);
exit(1);
}
value = buf;
} else if (field == "t/s") {
char buf[128];
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
value = buf;
} else if (vmap.find(field) != vmap.end()) {
value = vmap.at(field);
} else {
assert(false);
exit(1);
}
int width = get_field_width(field);
if (field == "t/s") {
// HACK: the utf-8 character is 2 bytes
width += 1;
}
fprintf(fout, " %*s |", width, value.c_str());
}
fprintf(fout, "\n");
}
void print_footer() override {
fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
}
};
struct sql_printer : public printer {
static std::string get_sql_field_type(const std::string & field) {
switch (test::get_field_type(field)) {
case test::STRING:
return "TEXT";
case test::BOOL:
case test::INT:
return "INTEGER";
case test::FLOAT:
return "REAL";
default:
assert(false);
exit(1);
}
}
void print_header(const cmd_params & params) override {
std::vector<std::string> fields = test::get_fields();
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : "");
}
fprintf(fout, ");\n");
fprintf(fout, "\n");
(void) params;
}
void print_test(const test & t) override {
fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
fprintf(fout, "VALUES (");
std::vector<std::string> values = t.get_values();
for (size_t i = 0; i < values.size(); i++) {
fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
}
fprintf(fout, ");\n");
}
};
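// benchmark kernels: both feed dummy BOS tokens through llama_eval;
// test_prompt evaluates n_prompt tokens in batches of n_batch, test_gen generates n_gen tokens one at a time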
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
int n_processed = 0;
while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch);
llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
n_processed += n_tokens;
}
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos(ctx);
for (int i = 0; i < n_gen; i++) {
llama_eval(ctx, &token, 1, n_past + i, n_threads);
}
}
static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
(void) level;
(void) text;
(void) user_data;
}
int main(int argc, char ** argv) {
#if !defined(NDEBUG)
fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif
#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
fprintf(stderr, "warning: debug build, performance may be affected\n");
#endif
#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif
cmd_params params = parse_cmd_params(argc, argv);
// initialize llama.cpp
if (!params.verbose) {
llama_log_set(llama_null_log_callback, NULL);
}
bool numa = false;
llama_backend_init(numa);
// initialize printer
std::unique_ptr<printer> p;
switch (params.output_format) {
case CSV:
p.reset(new csv_printer());
break;
case JSON:
p.reset(new json_printer());
break;
case MARKDOWN:
p.reset(new markdown_printer());
break;
case SQL:
p.reset(new sql_printer());
break;
default:
assert(false);
exit(1);
}
p->fout = stdout;
p->print_header(params);
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
for (const auto & inst : params_instances) {
// TODO: keep the model between tests when possible
llama_context_params lparams = inst.to_llama_params();
llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams);
if (lmodel == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
return 1;
}
llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_free_model(lmodel);
return 1;
}
test t(inst, lmodel, ctx);
// warmup run
test_gen(ctx, 1, 0, t.n_threads);
for (int i = 0; i < params.reps; i++) {
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
}
uint64_t t_ns = get_time_ns() - t_start;
t.samples_ns.push_back(t_ns);
}
p->print_test(t);
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(lmodel);
}
p->print_footer();
llama_backend_free();
return 0;
}
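For illustration only, a hedged example of sweeping two batch sizes with the options defined above (the model path and values are placeholders):

```bash
./llama-bench -m models/7B/ggml-model-q4_0.bin -p 512 -n 128 -b 256,512 -o md
```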

View file

@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
{ {
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos()); const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
} }
@ -191,10 +191,6 @@ int main(int argc, char ** argv) {
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
embd_inp = ::llama_tokenize(ctx, params.prompt, true); embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else { } else {
@ -270,15 +266,12 @@ int main(int argc, char ** argv) {
params.interactive = true; params.interactive = true;
} }
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) { if (params.verbose_prompt) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
} }
if (ctx_guidance) { if (ctx_guidance) {
@ -286,14 +279,14 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) { for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i])); fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
} }
} }
if (params.n_keep > 0) { if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i])); fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
} }
fprintf(stderr, "'\n"); fprintf(stderr, "'\n");
} }
@ -311,7 +304,7 @@ int main(int argc, char ** argv) {
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
}; };
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif #endif
fprintf(stderr, "%s: interactive mode on.\n", __func__); fprintf(stderr, "%s: interactive mode on.\n", __func__);
@ -352,10 +345,9 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
{ {
auto it = params.logit_bias.find(llama_token_eos()); auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.logit_bias.end() && it->second == -INFINITY) {
fprintf(stderr, fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
"%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
} }
} }
@ -405,7 +397,7 @@ int main(int argc, char ** argv) {
// do one empty run to warm up the model // do one empty run to warm up the model
{ {
const std::vector<llama_token> tmp = { llama_token_bos(), }; const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx); llama_reset_timings(ctx);
} }
@ -589,7 +581,7 @@ int main(int argc, char ** argv) {
} }
// Apply penalties // Apply penalties
float nl_logit = logits[llama_token_nl()]; float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p, llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -598,7 +590,7 @@ int main(int argc, char ** argv) {
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence); last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) { if (!penalize_nl) {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl(ctx)] = nl_logit;
} }
if (grammar != NULL) { if (grammar != NULL) {
@ -662,7 +654,7 @@ int main(int argc, char ** argv) {
// display text // display text
if (input_echo) { if (input_echo) {
for (auto id : embd) { for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id)); printf("%s", llama_token_to_str(ctx, id).c_str());
} }
fflush(stdout); fflush(stdout);
} }
@ -704,7 +696,7 @@ int main(int argc, char ** argv) {
} }
// deal with end of text token in interactive mode // deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos()) { if (last_n_tokens.back() == llama_token_eos(ctx)) {
if (params.interactive) { if (params.interactive) {
if (params.antiprompt.size() != 0) { if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt // tokenize and inject first reverse prompt
@ -728,7 +720,7 @@ int main(int argc, char ** argv) {
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
embd_inp.push_back(llama_token_bos()); embd_inp.push_back(llama_token_bos(ctx));
} }
std::string buffer; std::string buffer;
@ -782,8 +774,7 @@ int main(int argc, char ** argv) {
if (grammar != NULL) { if (grammar != NULL) {
llama_grammar_free(grammar); llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules( std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
parsed_grammar.c_rules());
grammar = llama_grammar_init( grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root")); parsed_grammar.symbol_ids.at("root"));
@ -794,7 +785,7 @@ int main(int argc, char ** argv) {
} }
// end of text token // end of text token
if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) { if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
fprintf(stderr, " [end of text]\n"); fprintf(stderr, " [end of text]\n");
break; break;
} }

View file

@ -2,7 +2,7 @@
// //
// - First, export a LLaMA graph: // - First, export a LLaMA graph:
// //
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export // $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
// //
// - Run this tool to evaluate the exported graph: // - Run this tool to evaluate the exported graph:
// //

View file

@ -5,6 +5,7 @@
#include <cmath> #include <cmath>
#include <ctime> #include <ctime>
#include <sstream> #include <sstream>
#include <cstring>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@ -63,7 +64,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (j == 0) { if (j == 0) {
tokens[batch_start] = llama_token_bos(); tokens[batch_start] = llama_token_bos(ctx);
} }
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
@ -88,7 +89,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%d hours ", total_seconds / (60*60)); fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60); total_seconds = total_seconds % (60*60);
} }
fprintf(stderr, "%d minutes\n", total_seconds / 60); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
// We get the logits for all the tokens in the context window (params.n_ctx) // We get the logits for all the tokens in the context window (params.n_ctx)
@ -121,6 +122,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
printf("\n"); printf("\n");
} }
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
int n_vocab, int n_thread) {
std::vector<float> result;
result.reserve(tokens.size() * n_vocab);
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
size_t n_tokens = tokens.size() - i_chunk * n_batch;
n_tokens = std::min(n_tokens, size_t(n_batch));
if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return {};
}
const auto logits = llama_get_logits(ctx);
result.insert(result.end(), logits, logits + n_tokens * n_vocab);
n_past += n_tokens;
}
return result;
}
void hellaswag_score(llama_context * ctx, const gpt_params & params) { void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Calculates hellaswag score (acc_norm) from prompt // Calculates hellaswag score (acc_norm) from prompt
// //
@ -209,50 +231,93 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f; double acc = 0.0f;
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
std::vector<float> tok_logits(n_vocab);
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
// Tokenize the context to count tokens // Tokenize the context to count tokens
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos); std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
size_t context_size = context_embd.size(); size_t context_size = context_embd.size();
for (size_t ending_idx=0;ending_idx<4;ending_idx++) { // Do the 1st ending
// In this case we include the context when evaluating
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
auto query_size = query_embd.size();
//printf("First query: %d\n",(int)query_size);
// Stop if query won't fit the ctx window
if (query_size > (size_t)params.n_ctx) {
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
return;
}
// Speed up small evaluations by evaluating at least 32 tokens
if (query_size < 32) {
query_embd.resize(32);
}
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
if (logits.empty()) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
hs_data[task_idx].ending_logprob_count[0] = 1;
hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
// Calculate the logprobs over the ending
for (size_t j = context_size; j < query_size - 1; j++) {
std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
const float prob = softmax(tok_logits)[query_embd[j + 1]];
hs_data[task_idx].ending_logprob[0] += std::log(prob);
hs_data[task_idx].ending_logprob_count[0]++;
}
// Calculate the mean token logprob for acc_norm
hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
// Do the remaining endings
// For these, we use the bare ending with n_past = context_size
//
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
// Tokenize the query // Tokenize the query
std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos); query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
size_t query_size = query_embd.size(); query_size = query_embd.size();
// Stop if query wont fit the ctx window // Stop if query wont fit the ctx window
if (query_size > (size_t)params.n_ctx) { if (context_size + query_size > (size_t)params.n_ctx) {
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size); fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
return; return;
} }
// Speedup small evaluations by evaluating atleast 32 tokens // Speedup small evaluations by evaluating atleast 32 tokens
if (query_size < 32) { // No, resizing to 32 is actually slightly slower (at least on CUDA)
query_embd.resize(32); //if (query_size < 32) {
} // query_embd.resize(32);
//}
// Evaluate the query // Evaluate the query
if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) { logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
if (logits.empty()) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return; return;
} }
const auto query_logits = llama_get_logits(ctx); hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
std::vector<float> logits; hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
// Calculate the logprobs over the ending // Calculate the logprobs over the ending
for (size_t j = context_size-1; j < query_size - 1; j++) { for (size_t j = 0; j < query_size - 1; j++) {
// Calculate probability of next token, given the previous ones. std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[query_embd[ j + 1]]; const float prob = softmax(tok_logits)[query_embd[j + 1]];
hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob); hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
hs_data[task_idx].ending_logprob_count[ending_idx]++; hs_data[task_idx].ending_logprob_count[ending_idx]++;
@ -267,9 +332,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
} }
// Find the ending with maximum logprob // Find the ending with maximum logprob
size_t ending_logprob_max_idx = -1; size_t ending_logprob_max_idx = 0;
double ending_logprob_max_val = -INFINITY; double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
for (size_t j=0; j < 4; j++) { for (size_t j = 1; j < 4; j++) {
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) { if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
ending_logprob_max_idx = j; ending_logprob_max_idx = j;
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j]; ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];

View file

@ -24,7 +24,7 @@
#endif #endif
struct quantize_stats_params { struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin"; std::string model = "models/7B/ggml-model-f16.gguf";
bool verbose = false; bool verbose = false;
bool per_layer_stats = false; bool per_layer_stats = false;
bool print_histogram = false; bool print_histogram = false;

View file

@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
} }
// usage: // usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads] // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// //
void usage(const char * executable) { void usage(const char * executable) {
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable); fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n"); fprintf(stderr, "\nAllowed quantization types:\n");
@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
if (pos != std::string::npos) { if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1); fpath = fname_inp.substr(0, pos + 1);
} }
// export as [inp path]/ggml-model-[ftype].bin // export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".bin"; fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++; arg_idx++;
} }
else { else {

View file

@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params(); auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx; lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed; lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16; lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap; lparams.use_mmap = params.use_mmap;
@ -45,9 +44,8 @@ int main(int argc, char ** argv) {
llama_free_model(model); llama_free_model(model);
return 1; return 1;
} }
auto tokens = std::vector<llama_token>(params.n_ctx); auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) { if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -92,7 +90,7 @@ int main(int argc, char ** argv) {
auto next_token_str = llama_token_to_str(ctx, next_token); auto next_token_str = llama_token_to_str(ctx, next_token);
last_n_tokens_data.push_back(next_token); last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str); printf("%s", next_token_str.c_str());
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__); fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -152,7 +150,7 @@ int main(int argc, char ** argv) {
auto next_token_str = llama_token_to_str(ctx2, next_token); auto next_token_str = llama_token_to_str(ctx2, next_token);
last_n_tokens_data.push_back(next_token); last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str); printf("%s", next_token_str.c_str());
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__); fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2); llama_free(ctx2);

View file

@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
Command line options: Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during computation. - `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@ -48,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.): ### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./server -m models/7B/ggml-model.bin -c 2048 ./server -m models/7B/ggml-model.gguf -c 2048
``` ```
### Windows: ### Windows:
```powershell ```powershell
server.exe -m models\7B\ggml-model.bin -c 2048 server.exe -m models\7B\ggml-model.gguf -c 2048
``` ```
The above command will start a server that by default listens on `127.0.0.1:8080`. The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.

View file

@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline
FILES=$(ls $PUBLIC) FILES=$(ls $PUBLIC)
cd $PUBLIC
for FILE in $FILES; do for FILE in $FILES; do
func=$(echo $FILE | tr '.' '_') echo "generate $FILE.hpp"
echo "generate $FILE.hpp ($func)"
xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp # use simple flag for old version of xxd
xxd -i $FILE > $DIR/$FILE.hpp
done done

File diff suppressed because it is too large

View file

@ -144,12 +144,12 @@
import { SchemaConverter } from '/json-schema-to-grammar.mjs'; import { SchemaConverter } from '/json-schema-to-grammar.mjs';
const session = signal({ const session = signal({
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.", prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
template: "{{prompt}}\n\n{{history}}\n{{char}}:", template: "{{prompt}}\n\n{{history}}\n{{char}}:",
historyTemplate: "{{name}}: {{message}}", historyTemplate: "{{name}}: {{message}}",
transcript: [], transcript: [],
type: "chat", type: "chat",
char: "llama", char: "Llama",
user: "User", user: "User",
}) })
@ -170,6 +170,136 @@
grammar: '', grammar: '',
}) })
/* START: Support for storing prompt templates and parameters in browser LocalStorage */
const local_storage_storageKey = "llamacpp_server_local_storage";
function local_storage_setDataFromObject(tag, content) {
localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
}
function local_storage_setDataFromRawText(tag, content) {
localStorage.setItem(local_storage_storageKey + '/' + tag, content);
}
function local_storage_getDataAsObject(tag) {
const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
if (!item) {
return null;
} else {
return JSON.parse(item);
}
}
function local_storage_getDataAsRawText(tag) {
const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
if (!item) {
return null;
} else {
return item;
}
}
// create a container for user templates and settings
const savedUserTemplates = signal({})
const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
// let's import locally saved templates and settings if there are any
// user templates and settings are stored in one object
// in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
console.log('Importing saved templates')
let importedTemplates = local_storage_getDataAsObject('user_templates')
if (importedTemplates) {
// saved templates were successfully imported.
console.log('Processing saved templates and updating default template')
//console.log(importedTemplates);
savedUserTemplates.value = importedTemplates;
//override default template
savedUserTemplates.value.default = { session: session.value, params: params.value }
local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
} else {
// no saved templates detected.
console.log('Initializing LocalStorage and saving default template')
savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
}
function userTemplateResetToDefault() {
console.log('Resetting template to default')
selectedUserTemplate.value.name = 'default';
selectedUserTemplate.value.data = savedUserTemplates.value['default'];
}
function userTemplateApply(t) {
session.value = t.data.session;
params.value = t.data.params;
}
function userTemplateResetToDefaultAndApply() {
userTemplateResetToDefault()
userTemplateApply(selectedUserTemplate.value)
}
function userTemplateLoadAndApplyAutosaved() {
// get autosaved last used template
let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
if (lastUsedTemplate) {
console.log('Autosaved template found, restoring')
selectedUserTemplate.value = lastUsedTemplate
}
else {
console.log('No autosaved template found, using default template')
// no autosaved last used template was found, so load from default.
userTemplateResetToDefault()
}
console.log('Applying template')
// and update internal data from templates
userTemplateApply(selectedUserTemplate.value)
}
//console.log(savedUserTemplates.value)
//console.log(selectedUserTemplate.value)
function userTemplateAutosave() {
console.log('Template Autosave...')
if (selectedUserTemplate.value.name == 'default') {
// we don't want to save over default template, so let's create a new one
let newTemplateName = 'UserTemplate-' + Date.now().toString()
let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
console.log('Saving as ' + newTemplateName)
// save in the autosave slot
local_storage_setDataFromObject('user_templates_last', newTemplate)
// and load it back and apply
userTemplateLoadAndApplyAutosaved()
} else {
local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
}
}
console.log('Checking for autosaved last used template')
userTemplateLoadAndApplyAutosaved()
/* END: Support for storing prompt templates and parameters in browser LocalStorage */
const llamaStats = signal(null) const llamaStats = signal(null)
const controller = signal(null) const controller = signal(null)
@ -346,8 +476,34 @@
` `
}; };
const userTemplateReset = (e) => {
e.preventDefault();
userTemplateResetToDefaultAndApply()
}
const UserTemplateResetButton = () => {
if (selectedUserTemplate.value.name == 'default') {
return html`
<button disabled>Using default template</button>
`
}
return html`
<button onclick=${userTemplateReset}>Reset all to default</button>
`
};
useEffect(() => {
// autosave template on every change
userTemplateAutosave()
}, [session.value, params.value])
return html` return html`
<form> <form>
<fieldset>
<${UserTemplateResetButton}/>
</fieldset>
<fieldset> <fieldset>
<div> <div>
<label for="prompt">Prompt</label> <label for="prompt">Prompt</label>
@ -279,7 +279,7 @@ struct llama_server_context
grammar_parser::print_grammar(stderr, parsed_grammar); grammar_parser::print_grammar(stderr, parsed_grammar);
{ {
auto it = params.logit_bias.find(llama_token_eos()); auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.logit_bias.end() && it->second == -INFINITY) {
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
} }
@ -402,7 +402,7 @@ struct llama_server_context
if (params.n_predict == 0) if (params.n_predict == 0)
{ {
has_next_token = false; has_next_token = false;
result.tok = llama_token_eos(); result.tok = llama_token_eos(ctx);
return result; return result;
} }
@ -442,7 +442,7 @@ struct llama_server_context
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Apply penalties // Apply penalties
float nl_logit = logits[llama_token_nl()]; float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p, llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -452,7 +452,7 @@ struct llama_server_context
last_n_repeat, alpha_frequency, alpha_presence); last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) if (!penalize_nl)
{ {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl(ctx)] = nl_logit;
} }
if (grammar != nullptr) { if (grammar != nullptr) {
@ -515,7 +515,7 @@ struct llama_server_context
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
if (!embd.empty() && embd.back() == llama_token_eos()) if (!embd.empty() && embd.back() == llama_token_eos(ctx))
{ {
// stopping_word = llama_token_to_str(ctx, embd.back()); // stopping_word = llama_token_to_str(ctx, embd.back());
has_next_token = false; has_next_token = false;
@ -652,8 +652,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -774,23 +772,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.n_ctx = std::stoi(argv[i]); params.n_ctx = std::stoi(argv[i]);
} }
else if (arg == "-gqa" || arg == "--gqa")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_gqa = std::stoi(argv[i]);
}
else if (arg == "-eps" || arg == "--rms-norm-eps") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.rms_norm_eps = std::stof(argv[i]);
}
else if (arg == "--rope-freq-base") else if (arg == "--rope-freq-base")
{ {
if (++i >= argc) if (++i >= argc)
@ -968,7 +949,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
static json format_generation_settings(llama_server_context &llama) static json format_generation_settings(llama_server_context &llama)
{ {
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second); eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@ -1103,7 +1084,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
llama.params.logit_bias.clear(); llama.params.logit_bias.clear();
if (body.value("ignore_eos", false)) if (body.value("ignore_eos", false))
{ {
llama.params.logit_bias[llama_token_eos()] = -INFINITY; llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
} }
const auto &logit_bias = body.find("logit_bias"); const auto &logit_bias = body.find("logit_bias");
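A minimal sketch (not part of this diff) of the API change running through the server hunks above: the special-token helpers now take the llama_context, so call sites that used to write llama_token_eos() pass their context instead. `ctx` is assumed to be a valid llama_context* and the bias map is assumed to have gpt_params' usual std::unordered_map<llama_token, float> shape:

    #include "llama.h"
    #include <cmath>
    #include <unordered_map>

    // disable the end-of-stream token, as the server's "ignore_eos" option does above
    static void disable_eos(llama_context * ctx, std::unordered_map<llama_token, float> & logit_bias) {
        logit_bias[llama_token_eos(ctx)] = -INFINITY; // the EOS id is read from the loaded vocab via ctx
    }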
@ -2,180 +2,129 @@
#define _GNU_SOURCE #define _GNU_SOURCE
#endif #endif
#include "common.h"
#include "llama.h"
#include "build-info.h" #include "build-info.h"
#include <cassert> #include "common.h"
#include <cinttypes> #include "llama.h"
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) int main(int argc, char ** argv) {
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
int main(int argc, char ** argv)
{
gpt_params params; gpt_params params;
//--------------------------------- if (argc == 1 || argv[1][0] == '-') {
// Print help : printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
//---------------------------------
if ( argc == 1 || argv[1][0] == '-' )
{
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
return 1 ; return 1 ;
} }
//--------------------------------- if (argc >= 2) {
// Load parameters :
//---------------------------------
if ( argc >= 2 )
{
params.model = argv[1]; params.model = argv[1];
} }
if ( argc >= 3 ) if (argc >= 3) {
{
params.prompt = argv[2]; params.prompt = argv[2];
} }
if ( params.prompt.empty() ) if (params.prompt.empty()) {
{
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
//--------------------------------- // init LLM
// Init LLM :
//---------------------------------
llama_backend_init(params.numa); llama_backend_init(params.numa);
llama_model * model; llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params ); llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if ( model == NULL ) if (model == NULL) {
{ fprintf(stderr , "%s: error: unable to load model\n" , __func__);
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1; return 1;
} }
//--------------------------------- llama_context * ctx = llama_new_context_with_model(model, ctx_params);
// Tokenize the prompt :
//--------------------------------- // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize( ctx , params.prompt , true ); tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx( ctx ); const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4 ; const int max_tokens_list_size = max_context_size - 4;
if ( (int)tokens_list.size() > max_tokens_list_size ) if ((int) tokens_list.size() > max_tokens_list_size) {
{ fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
__func__ , (int)tokens_list.size() , max_tokens_list_size );
return 1; return 1;
} }
fprintf( stderr, "\n\n" ); fprintf(stderr, "\n\n");
// Print the tokens from the prompt : for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
for( auto id : tokens_list )
{
printf( "%s" , llama_token_to_str( ctx , id ) );
} }
fflush(stdout); fflush(stderr);
// main loop
//---------------------------------
// Main prediction loop :
//---------------------------------
// The LLM keeps a contextual cache memory of previous token evaluation. // The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous // Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected. // example, we will just stop the loop once this cache is full or once an end of stream is detected.
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) const int n_gen = std::min(32, max_context_size);
{
//---------------------------------
// Evaluate the tokens :
//---------------------------------
if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) while (llama_get_kv_cache_token_count(ctx) < n_gen) {
{ // evaluate the transformer
fprintf( stderr, "%s : failed to eval\n" , __func__ );
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1; return 1;
} }
tokens_list.clear(); tokens_list.clear();
//--------------------------------- // sample the next token
// Select the best prediction :
//---------------------------------
llama_token new_token_id = 0; llama_token new_token_id = 0;
auto logits = llama_get_logits( ctx ); auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens) auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve( n_vocab ); candidates.reserve(n_vocab);
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ ) for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
{ candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Select it using the "Greedy sampling" method : new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
// is it an end of stream ? // is it an end of stream ?
if ( new_token_id == llama_token_eos() ) if (new_token_id == llama_token_eos(ctx)) {
{
fprintf(stderr, " [end of text]\n"); fprintf(stderr, " [end of text]\n");
break; break;
} }
// Print the new token : // print the new token :
printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
fflush( stdout ); fflush(stdout);
// Push this new token for next evaluation : // push this new token for next evaluation
tokens_list.push_back( new_token_id ); tokens_list.push_back(new_token_id);
}
} // wend of main loop llama_free(ctx);
llama_free_model(model);
llama_free( ctx );
llama_free_model( model );
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n");
return 0; return 0;
} }
// EOF
@ -1,4 +1,5 @@
#include "ggml.h" #include "ggml.h"
#include "common.h"
#include "llama.h" #include "llama.h"
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
@ -16,7 +17,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; static const float rms_norm_eps = 1e-5f;
struct random_normal_distribution { struct random_normal_distribution {
std::mt19937 gen; std::mt19937 gen;
@ -169,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
struct llama_vocab { struct llama_vocab {
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
using ttype = llama_token_type;
struct token_score { struct token_data {
token tok; token text;
float score; float score;
ttype type;
}; };
std::unordered_map<token, id> token_to_id; std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token; std::vector<token_data> id_to_token;
}; };
struct my_llama_hparams { struct my_llama_hparams {
@ -1961,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {
void print_token(struct llama_context * ctx, llama_token token) { void print_token(struct llama_context * ctx, llama_token token) {
printf("%s", llama_token_to_str(ctx, token)); printf("%s", llama_token_to_str(ctx, token).c_str());
} }
void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@ -1995,7 +1998,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
} }
} }
void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = target_logits->ne[0]; int n_vocab = target_logits->ne[0];
@ -2004,7 +2007,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
ggml_set_f32(target_logits, -1.0f/n_vocab); ggml_set_f32(target_logits, -1.0f/n_vocab);
ggml_set_f32(target_probs, 0.0f); ggml_set_f32(target_probs, 0.0f);
ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
for (int i=1; i<n_tokens+1; ++i) { for (int i=1; i<n_tokens+1; ++i) {
int token = clamp(train_data[sample+i-1], 0, n_vocab-1); int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
set_f32_2d(target_logits, token, i-1, +1.0f); set_f32_2d(target_logits, token, i-1, +1.0f);
@ -2015,7 +2018,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
} }
} }
void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_logits->n_dims == 3);
GGML_ASSERT(target_probs->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3);
@ -2035,7 +2038,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples]; size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
GGML_ASSERT(sample+n_tokens-1 < n_train_data); GGML_ASSERT(sample+n_tokens-1 < n_train_data);
set_i32_2d(tokens_input, 0, k, llama_token_bos()); set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
for (int i=1; i<n_tokens+1; ++i) { for (int i=1; i<n_tokens+1; ++i) {
int token = clamp(train_data[sample+i-1], 0, n_vocab-1); int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
// print_token(lctx, token); // print_token(lctx, token);
@ -2188,11 +2191,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
f.read_raw(buf.data(), f.size); f.read_raw(buf.data(), f.size);
buf[f.size] = '\0'; buf[f.size] = '\0';
out.resize(buf.size()); int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
if (n_tokens < 0) {
int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); out.resize(-n_tokens);
if (n_tokens >= 0) { llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
out.resize(n_tokens);
} }
bool verify = false; bool verify = false;
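A small sketch (assumptions: a valid `lctx` and an std::string `text`) of the resize-and-retry convention used in the hunk above — llama_tokenize now returns the negative of the required token count when the output buffer is too small:

    #include "llama.h"
    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize_retry(llama_context * lctx, const std::string & text) {
        std::vector<llama_token> out(text.size() + 1);                 // rough upper bound for the first try
        int n = llama_tokenize(lctx, text.c_str(), out.data(), (int) out.size(), /*add_bos =*/ false);
        if (n < 0) {
            out.resize(-n);                                            // -n is the exact number of tokens needed
            n = llama_tokenize(lctx, text.c_str(), out.data(), (int) out.size(), false);
        }
        out.resize(n);                                                 // shrink to the real token count
        return out;
    }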
@ -2200,17 +2202,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
const char * in = buf.data(); const char * in = buf.data();
const char * end = buf.data() + buf.size(); const char * end = buf.data() + buf.size();
for (int i = 0; i < (int) out.size(); ++i) { for (int i = 0; i < (int) out.size(); ++i) {
const char * s = llama_token_to_str(lctx, out[i]); std::string s = llama_token_to_str(lctx, out[i]);
int len = strlen(s); int len = s.length();
if (in >= end) { if (in >= end) {
printf("%s: unexpected end of original text.\n", __func__); printf("%s: unexpected end of original text.\n", __func__);
break; break;
} }
const bool matches = (strncmp(in, s, len) == 0); const bool matches = (strncmp(in, s.c_str(), len) == 0);
if (matches) { if (matches) {
in += len; in += len;
} else { } else {
printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
} }
} }
} }
@ -2294,7 +2296,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
const auto params = sampler->params; const auto params = sampler->params;
// Apply penalties // Apply penalties
const float nl_logit = logits[llama_token_nl()]; const float nl_logit = logits[llama_token_nl(ctx)];
const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
@ -2313,7 +2315,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
params.alpha_presence); params.alpha_presence);
if (!params.penalize_nl) { if (!params.penalize_nl) {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl(ctx)] = nl_logit;
} }
llama_token token = 0; llama_token token = 0;
@ -2612,42 +2614,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
return; return;
} }
// write_magic #pragma message("TODO: implement file saving using gguf")
file.write_u32(LLAMA_FILE_MAGIC); // magic (void) vocab;
file.write_u32(LLAMA_FILE_VERSION); // version (void) model;
// write_hparams // // write_magic
file.write_u32(model->hparams.n_vocab); // file.write_u32(LLAMA_FILE_MAGIC); // magic
file.write_u32(model->hparams.n_embd); // file.write_u32(LLAMA_FILE_VERSION); // version
file.write_u32(model->hparams.n_mult); // // write_hparams
file.write_u32(model->hparams.n_head); // file.write_u32(model->hparams.n_vocab);
file.write_u32(model->hparams.n_layer); // file.write_u32(model->hparams.n_embd);
file.write_u32(model->hparams.n_rot); // file.write_u32(model->hparams.n_mult);
file.write_u32(LLAMA_FTYPE_ALL_F32); // file.write_u32(model->hparams.n_head);
// write_vocab // file.write_u32(model->hparams.n_layer);
uint32_t n_vocab = model->hparams.n_vocab; // file.write_u32(model->hparams.n_rot);
for (uint32_t i = 0; i < n_vocab; i++) { // file.write_u32(LLAMA_FTYPE_ALL_F32);
const auto & token_score = vocab->id_to_token.at(i); // // write_vocab
file.write_u32((uint32_t) token_score.tok.size()); // uint32_t n_vocab = model->hparams.n_vocab;
file.write_raw(token_score.tok.data(), token_score.tok.size()); // for (uint32_t i = 0; i < n_vocab; i++) {
file.write_raw(&token_score.score, sizeof(token_score.score)); // const auto & token_data = vocab->id_to_token.at(i);
} // file.write_u32((uint32_t) token_data.tok.size());
// write tensors // file.write_raw(token_data.tok.data(), token_data.tok.size());
write_tensor(&file, model->tok_embeddings); // file.write_raw(&token_data.score, sizeof(token_data.score));
write_tensor(&file, model->norm); // }
write_tensor(&file, model->output); // // write tensors
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { // write_tensor(&file, model->tok_embeddings);
auto & layer = model->layers[i]; // write_tensor(&file, model->norm);
// write_tensor(&file, model->output);
write_tensor(&file, layer.attention_norm); // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
write_tensor(&file, layer.wq); // auto & layer = model->layers[i];
write_tensor(&file, layer.wk); //
write_tensor(&file, layer.wv); // write_tensor(&file, layer.attention_norm);
write_tensor(&file, layer.wo); // write_tensor(&file, layer.wq);
write_tensor(&file, layer.ffn_norm); // write_tensor(&file, layer.wk);
write_tensor(&file, layer.w1); // write_tensor(&file, layer.wv);
write_tensor(&file, layer.w2); // write_tensor(&file, layer.wo);
write_tensor(&file, layer.w3); // write_tensor(&file, layer.ffn_norm);
} // write_tensor(&file, layer.w1);
// write_tensor(&file, layer.w2);
// write_tensor(&file, layer.w3);
// }
} }
float cosine_decay(const int decay_steps, const float alpha, int step) { float cosine_decay(const int decay_steps, const float alpha, int step) {
@ -3052,20 +3057,13 @@ int main(int argc, char ** argv) {
struct llama_vocab vocab; struct llama_vocab vocab;
{ {
std::vector<const char *> strings; const int n_vocab = llama_n_vocab(lctx);
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
vocab.id_to_token.resize(n_vocab); vocab.id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) { for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]); vocab.id_to_token[i].text = llama_token_get_text(lctx, i);
float score = scores[i]; vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
vocab.id_to_token[i].tok = tok; vocab.id_to_token[i].type = llama_token_get_type(lctx, i);
vocab.id_to_token[i].score = score; vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
vocab.token_to_id.emplace(tok, i);
} }
} }
@ -3178,7 +3176,7 @@ int main(int argc, char ** argv) {
std::vector<int> train_samples; std::vector<int> train_samples;
train_samples.push_back(0); train_samples.push_back(0);
for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) { if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
train_samples.push_back(i); train_samples.push_back(i);
} }
} }
@ -3338,7 +3336,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
for (int i=sample_ctx; i<n_tokens; ++i) { for (int i=sample_ctx; i<n_tokens; ++i) {
ggml_set_i32_1d(tokens_input, i, n_vocab/2); ggml_set_i32_1d(tokens_input, i, n_vocab/2);
} }
@ -14,8 +14,6 @@
with pkgs.darwin.apple_sdk_11_0.frameworks; [ with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate Accelerate
MetalKit MetalKit
MetalPerformanceShaders
MetalPerformanceShadersGraph
] ]
else if isAarch32 && isDarwin then else if isAarch32 && isDarwin then
with pkgs.darwin.apple_sdk.frameworks; [ with pkgs.darwin.apple_sdk.frameworks; [
@ -67,6 +67,8 @@ struct ggml_allocr {
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size; size_t max_size;
bool measure; bool measure;
int parse_seq[GGML_MAX_NODES];
bool has_parse_seq;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024]; struct ggml_tensor * allocated_tensors[1024];
@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
size_t max_avail = 0; size_t max_avail = 0;
// find the best fitting free block // find the best fitting free block besides the last block
int best_fit_block = -1; int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX; size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks; i++) { for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i]; struct free_block * block = &alloc->free_blocks[i];
max_avail = MAX(max_avail, block->size); max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) { if (block->size >= size && block->size <= best_fit_size) {
@ -126,10 +128,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
AT_PRINTF("block %d\n", best_fit_block); AT_PRINTF("block %d\n", best_fit_block);
if (best_fit_block == -1) { if (best_fit_block == -1) {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", // the last block is our last resort
__func__, size, max_avail); struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
GGML_ASSERT(!"not enough space in the buffer"); if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
max_avail = MAX(max_avail, block->size);
} else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
return; return;
}
} }
struct free_block * block = &alloc->free_blocks[best_fit_block]; struct free_block * block = &alloc->free_blocks[best_fit_block];
void * addr = block->addr; void * addr = block->addr;
@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
alloc->n_free_blocks++; alloc->n_free_blocks++;
} }
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
int pos = 0;
for (int i = 0; i < n; i++) {
if (list[i] != -1) {
alloc->parse_seq[pos] = list[i];
pos++;
}
}
alloc->has_parse_seq = true;
}
void ggml_allocr_reset(struct ggml_allocr * alloc) { void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1; alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@ -248,6 +268,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
/*.hash_table = */ {{0}}, /*.hash_table = */ {{0}},
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ false, /*.measure = */ false,
/*.parse_seq = */ {0},
/*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -275,6 +297,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
/*.hash_table = */ {{0}}, /*.hash_table = */ {{0}},
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ true, /*.measure = */ true,
/*.parse_seq = */ {0},
/*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -473,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
allocate_node(alloc, input); allocate_node(alloc, input);
} }
} }
for (int i = 0; i < gf->n_nodes; i++) { for (int ind = 0; ind < gf->n_nodes; ind++) {
int i;
if (alloc->has_parse_seq) {
i = alloc->parse_seq[ind];
} else {
i = ind;
}
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs) // allocate parents (leafs)
@ -10,6 +10,10 @@ extern "C" {
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph is optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
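A short usage sketch of the new hook (the node order here is made up): -1 entries are dropped when the list is copied, so a Metal concur_list — which uses -1 as a barrier marker — can be passed in unchanged:

    // measure allocator that will visit graph nodes in the order 2, 0, 1, 3
    struct ggml_allocr * alloc = ggml_allocr_new_measure(/*alignment =*/ 32);
    int order[] = { 2, 0, -1, 1, 3 };
    ggml_allocr_set_parse_seq(alloc, order, 5);
    // ... build the graph, run ggml_allocr_alloc_graph(alloc, gf), then ggml_allocr_free(alloc)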
@ -6578,3 +6578,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
func(tensor->src[0], tensor->src[1], tensor); func(tensor->src[0], tensor->src[1], tensor);
return true; return true;
} }
int ggml_cuda_get_device_count() {
int device_count;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
return device_count;
}
void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
snprintf(description, description_size, "%s", prop.name);
}
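For completeness, a sketch that exercises the two helpers added above (requires a CUDA-enabled build; the declarations are exported from ggml-cuda.h further down):

    #include "ggml-cuda.h"
    #include <cstdio>

    int main() {
        char desc[256];
        const int n_dev = ggml_cuda_get_device_count();
        for (int i = 0; i < n_dev; ++i) {
            ggml_cuda_get_device_description(i, desc, sizeof(desc));
            printf("CUDA device %d: %s\n", i, desc);
        }
        return 0;
    }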
@ -8,29 +8,25 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16 #define GGML_CUDA_MAX_DEVICES 16
void ggml_init_cublas(void); GGML_API void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split); GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
GGML_API void ggml_cuda_free_scratch(void);
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
// TODO: export these with GGML_API GGML_API int ggml_cuda_get_device_count(void);
void * ggml_cuda_host_malloc(size_t size); GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
void ggml_cuda_host_free(void * ptr);
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef __cplusplus #ifdef __cplusplus
} }
@ -38,6 +38,9 @@ struct ggml_metal_context;
struct ggml_metal_context * ggml_metal_init(int n_cb); struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx); void ggml_metal_free(struct ggml_metal_context * ctx);
void * ggml_metal_host_malloc(size_t n);
void ggml_metal_host_free (void * data);
// set the number of command buffers to use // set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
@ -63,10 +66,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
// try to find operations that can be run concurrently in the graph // try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes // you should run it again if the topology of your graph changes
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
// if the graph has been optimized for concurrently dispatch // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx); int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
// output the concur_list for ggml_alloc
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal // same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel // creates gf->n_threads command buffers in parallel
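A sketch of how the reworked concurrency API is meant to be wired together with the ggml_allocr_set_parse_seq hook shown earlier (ctx_metal, gf and alloc are assumed to exist, e.g. in llama.cpp's context setup):

    // find an out-of-order schedule once, then let the allocator follow the same order;
    // check_mem = false skips the data-overlap check, which is useful when tensor
    // addresses will only be assigned later by ggml-alloc
    ggml_metal_graph_find_concurrency(ctx_metal, gf, /*check_mem =*/ false);

    const int n_concur = ggml_metal_if_optimized(ctx_metal); // now returns the concur_list length, 0 if not optimized
    if (n_concur) {
        ggml_allocr_set_parse_seq(alloc, ggml_metal_get_concur_list(ctx_metal), n_concur);
    }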
@ -5,7 +5,6 @@
#import <Foundation/Foundation.h> #import <Foundation/Foundation.h>
#import <Metal/Metal.h> #import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#undef MIN #undef MIN
#undef MAX #undef MAX
@ -79,6 +78,14 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@ -110,13 +117,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->n_buffers = 0; ctx->n_buffers = 0;
ctx->concur_list_len = 0; ctx->concur_list_len = 0;
// determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) {
fprintf(stderr, "%s: using MPS\n", __func__);
} else {
fprintf(stderr, "%s: not using MPS\n", __func__);
GGML_ASSERT(false && "MPS not supported");
}
#if 0 #if 0
// compile from source string and show compile log // compile from source string and show compile log
@ -163,10 +163,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels // load kernels
{ {
NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
if (error) { \
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \
}
GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add);
GGML_METAL_ADD_KERNEL(add_row); GGML_METAL_ADD_KERNEL(add_row);
@ -196,6 +201,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@ -224,15 +237,31 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
free(ctx); free(ctx);
} }
void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
const int result = posix_memalign((void **) &data, getpagesize(), n);
if (result != 0) {
fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
return NULL;
}
return data;
}
void ggml_metal_host_free(void * data) {
free(data);
}
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb; ctx->n_cb = n_cb;
} }
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) { int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
if (ctx->concur_list_len) { return ctx->concur_list_len;
return true; }
}
return false; int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
return ctx->concur_list;
} }
// finds the Metal buffer that contains the tensor data on the GPU device // finds the Metal buffer that contains the tensor data on the GPU device
@ -375,7 +404,7 @@ void ggml_metal_get_tensor(
void ggml_metal_graph_find_concurrency( void ggml_metal_graph_find_concurrency(
struct ggml_metal_context * ctx, struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) { struct ggml_cgraph * gf, bool check_mem) {
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
int nodes_unused[GGML_MAX_CONCUR]; int nodes_unused[GGML_MAX_CONCUR];
@ -422,7 +451,7 @@ void ggml_metal_graph_find_concurrency(
} }
} }
} }
if (exe_flag) { if (exe_flag && check_mem) {
// check if nodes[i]'s data will be overwritten by a node before nodes[i]. // check if nodes[i]'s data will be overwritten by a node before nodes[i].
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
int64_t data_start = (int64_t) gf->nodes[i]->data; int64_t data_start = (int64_t) gf->nodes[i]->data;
@ -506,7 +535,7 @@ void ggml_metal_graph_compute(
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx]; id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = nil; id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb; const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
@ -515,10 +544,6 @@ void ggml_metal_graph_compute(
const int i = has_concur ? ctx->concur_list[ind] : ind; const int i = has_concur ? ctx->concur_list[ind] : ind;
if (i == -1) { if (i == -1) {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
continue;
}
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
continue; continue;
} }
@ -592,10 +617,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ADD: case GGML_OP_ADD:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row]; [encoder setComputePipelineState:ctx->pipeline_add_row];
@ -613,10 +634,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_MUL: case GGML_OP_MUL:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
if (ggml_nelements(src1) == ne10) { if (ggml_nelements(src1) == ne10) {
// src1 is a row // src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row]; [encoder setComputePipelineState:ctx->pipeline_mul_row];
@ -634,10 +651,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float scale = *(const float *) src1->data; const float scale = *(const float *) src1->data;
[encoder setComputePipelineState:ctx->pipeline_scale]; [encoder setComputePipelineState:ctx->pipeline_scale];
@ -653,10 +666,6 @@ void ggml_metal_graph_compute(
switch (ggml_get_unary_op(gf->nodes[i])) { switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -667,10 +676,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_relu]; [encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -681,10 +686,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
[encoder setComputePipelineState:ctx->pipeline_gelu]; [encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@ -701,10 +702,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32; const int nth = 32;
[encoder setComputePipelineState:ctx->pipeline_soft_max]; [encoder setComputePipelineState:ctx->pipeline_soft_max];
@ -719,10 +716,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *)(dst->op_params))[0]; const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
@ -740,53 +733,43 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne00 == ne10);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
uint gqa = ne12/ne02;
GGML_ASSERT(ne03 == ne13); GGML_ASSERT(ne03 == ne13);
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
if (ggml_is_contiguous(src0) && if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) && ggml_is_contiguous(src1) &&
(src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { src1t == GGML_TYPE_F32 &&
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
if (encoder != nil) { ne00%32 == 0 &&
[encoder endEncoding]; ne11 > 1) {
encoder = nil; switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} }
else {
MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
// for F32 x F32 we use MPS
MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
initWithDevice:ctx->device transposeLeft:false transposeRight:true
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
// we need to do ne12 multiplications
// TODO: is there a way to do this in parallel - currently very slow ..
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
for (int64_t i02 = 0; i02 < ne12; ++i02) {
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
size_t offs_src1_cur = offs_src1 + i02*nb12;
size_t offs_dst_cur = offs_dst + i02*nb2;
MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ];
[mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
}
} else {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
int nth0 = 32; int nth0 = 32;
int nth1 = 1; int nth1 = 1;
@ -885,23 +868,24 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else #else
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif #endif
} }
else if (src0t == GGML_TYPE_Q5_K) { else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -910,10 +894,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
@ -939,10 +919,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -962,10 +938,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_NORM: case GGML_OP_NORM:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const float eps = 1e-5f; const float eps = 1e-5f;
const int nth = 256; const int nth = 256;
@ -984,10 +956,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ALIBI: case GGML_OP_ALIBI:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
GGML_ASSERT((src0t == GGML_TYPE_F32)); GGML_ASSERT((src0t == GGML_TYPE_F32));
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
@ -1027,10 +995,6 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int n_past = ((int32_t *) dst->op_params)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
@ -1071,10 +1035,6 @@ void ggml_metal_graph_compute(
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
{ {
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
}
const int nth = 32; const int nth = 32;
switch (src0t) { switch (src0t) {
(a file diff was suppressed here because it is too large)
(ggml.c — 1256 changed lines — file diff suppressed because it is too large)
(ggml.h — 128 changed lines — diff follows)
@ -207,14 +207,18 @@
#define GGML_MAX_PARAMS 256 #define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6 #define GGML_MAX_SRC 6
#define GGML_MAX_NAME 48 #define GGML_MAX_NAME 64
#define GGML_MAX_OP_PARAMS 32 #define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
#define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1 #define GGML_EXIT_ABORTED 1
#define GGUF_MAGIC 0x46554747 // "GGUF"
#define GGUF_VERSION 1
#define GGUF_DEFAULT_ALIGNMENT 32
#define GGML_UNUSED(x) (void)(x) #define GGML_UNUSED(x) (void)(x)
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@ -562,6 +566,7 @@ extern "C" {
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
GGML_API int ggml_blck_size (enum ggml_type type); GGML_API int ggml_blck_size (enum ggml_type type);
@ -1494,7 +1499,6 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * tensor); struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
@ -1703,6 +1707,118 @@ extern "C" {
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
//
// gguf
//
enum gguf_type {
GGUF_TYPE_UINT8 = 0,
GGUF_TYPE_INT8 = 1,
GGUF_TYPE_UINT16 = 2,
GGUF_TYPE_INT16 = 3,
GGUF_TYPE_UINT32 = 4,
GGUF_TYPE_INT32 = 5,
GGUF_TYPE_FLOAT32 = 6,
GGUF_TYPE_BOOL = 7,
GGUF_TYPE_STRING = 8,
GGUF_TYPE_ARRAY = 9,
GGUF_TYPE_COUNT, // marks the end of the enum
};
struct gguf_context;
struct gguf_init_params {
bool no_alloc;
// if not NULL, create a ggml_context and allocate the tensor data in it
struct ggml_context ** ctx;
};
GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
GGML_API void gguf_free(struct gguf_context * ctx);
GGML_API const char * gguf_type_name(enum gguf_type type);
GGML_API int gguf_get_version (struct gguf_context * ctx);
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
GGML_API void * gguf_get_data (struct gguf_context * ctx);
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
// results are undefined if the wrong type is used for the key
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
// overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
// set or add KV pairs from another context
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
// manage tensor info
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
// writing gguf files can be done in 2 ways:
//
// - write the entire gguf_context to a binary file in a single pass:
//
// gguf_write_to_file(ctx, fname);
//
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
//
// FILE * f = fopen(fname, "wb");
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
//   fwrite(tensor_data, 1, tensor_data_size, f);   // tensor data (placeholder buffer)
//   void * meta = malloc(gguf_get_meta_size(ctx));
//   gguf_get_meta_data(ctx, meta);
//   fseek(f, 0, SEEK_SET);
//   fwrite(meta, 1, gguf_get_meta_size(ctx), f);
//   free(meta);
// fclose(f);
//
// write the entire context to a binary file
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
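As a quick illustration of how these declarations fit together, the following sketch writes a small gguf file in a single pass and then reads it back to dump its metadata. It is only an example: the file name and key names are made up, error handling is minimal, and it assumes the gguf API is exposed through ggml.h (with ggml_free available for the returned ggml_context), as in this diff.

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // --- write: build a context in memory and dump it in one pass ---
        struct gguf_context * gctx = gguf_init_empty();

        gguf_set_val_u32(gctx, "example.answer", 42);
        gguf_set_val_f32(gctx, "example.scale",  0.5f);
        gguf_set_val_str(gctx, "general.name",   "tiny-test");

        // header + kv pairs + (empty) tensor info + padding, written in a single pass
        gguf_write_to_file(gctx, "test.gguf", /*only_meta =*/ false);
        gguf_free(gctx);

        // --- read back: metadata only, no tensor data allocation ---
        struct ggml_context * meta = NULL;

        struct gguf_init_params params = {
            /*.no_alloc =*/ true,
            /*.ctx      =*/ &meta,
        };

        struct gguf_context * rctx = gguf_init_from_file("test.gguf", params);
        if (rctx == NULL) {
            fprintf(stderr, "failed to read test.gguf\n");
            return 1;
        }

        printf("version: %d, alignment: %zu, data offset: %zu\n",
                gguf_get_version(rctx), gguf_get_alignment(rctx), gguf_get_data_offset(rctx));

        for (int i = 0; i < gguf_get_n_kv(rctx); ++i) {
            printf("kv[%d]: %s (%s)\n", i, gguf_get_key(rctx, i), gguf_type_name(gguf_get_kv_type(rctx, i)));
        }

        for (int i = 0; i < gguf_get_n_tensors(rctx); ++i) {
            printf("tensor[%d]: %s @ offset %zu\n", i, gguf_get_tensor_name(rctx, i), gguf_get_tensor_offset(rctx, i));
        }

        gguf_free(rctx);
        if (meta) {
            ggml_free(meta);
        }
        return 0;
    }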
    //
    // system info
    //

@@ -1740,6 +1856,10 @@ extern "C" {
    typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

    typedef struct {
+       const char * type_name;
+       int          blck_size;
+       size_t       type_size;
+       bool         is_quantized;
        ggml_to_float_t   to_float;
        ggml_from_float_t from_float;
        ggml_from_float_t from_float_reference;
@@ -1747,7 +1867,7 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;

-   ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+   ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

#ifdef __cplusplus
}
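The new fields in ggml_type_traits_t make each type's layout queryable at run time. A minimal, illustrative sketch of reading them through ggml_internal_get_type_traits() (GGML_TYPE_Q4_0 is just an example type):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // the fields added in this diff (type_name, blck_size, type_size, is_quantized)
        // describe how values of this type are grouped and stored
        ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);

        printf("%s: block of %d elements, %zu bytes per block, quantized = %d\n",
                traits.type_name, traits.blck_size, traits.type_size, (int) traits.is_quantized);

        return 0;
    }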

gguf.py — new file (718 lines)

@@ -0,0 +1,718 @@
import shutil
import sys
import struct
import tempfile
import numpy as np
from enum import IntEnum, auto
from typing import Any, IO, List, Optional
#
# constants
#
GGUF_MAGIC = 0x46554747
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
#
# recommended mapping of model tensor names for storage in gguf
#
class MODEL_ARCH(IntEnum):
LLAMA = auto()
FALCON = auto()
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
MPT = auto()
class MODEL_TENSOR(IntEnum):
TOKEN_EMBD = auto()
POS_EMBD = auto()
OUTPUT = auto()
OUTPUT_NORM = auto()
ROPE_FREQS = auto()
ATTN_Q = auto()
ATTN_K = auto()
ATTN_V = auto()
ATTN_QKV = auto()
ATTN_OUT = auto()
ATTN_NORM = auto()
ATTN_NORM_2 = auto()
ATTN_ROT_EMBD = auto()
FFN_GATE = auto()
FFN_DOWN = auto()
FFN_UP = auto()
FFN_NORM = auto()
MODEL_ARCH_NAMES = {
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
}
MODEL_TENSOR_NAMES = {
MODEL_ARCH.LLAMA: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPTNEOX: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.FALCON: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPT2: {
# TODO
},
# TODO
}
# tensors that will not be serialized
MODEL_TENSOR_SKIP = {
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
}
# TODO: the following helper functions should be removed
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
# REMOVE
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
for skip in MODEL_TENSOR_SKIP.get(arch, []):
for i in range(n_blocks):
if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
return True
return False
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
tensor_map = {}
# Token embeddings
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Output norm
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth
# Rope frequencies
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
tensor_map["rope.freqs"] = mapped_to # llama-pth
# Attention and feed-forward blocks
for i in range(0, n_blocks):
# Attention norm
# TODO: is there a simpler way to write these 2 lines in Python?
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
mapped_to = mapped_to.format(bid=i) if mapped_to else None
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
# Attention query-key-value
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Rotary embeddings
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth
# Feed-forward norm
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map
class TokenType(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
#
# implementation
#
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1
Q4_0 = 2
Q4_1 = 3
Q5_0 = 6
Q5_1 = 7
Q8_0 = 8
Q8_1 = 9
Q2_K = 10
Q3_K = 11
Q4_K = 12
Q5_K = 13
Q6_K = 14
Q8_K = 15
class GGUFValueType(IntEnum):
UINT8 = 0
INT8 = 1
UINT16 = 2
INT16 = 3
UINT32 = 4
INT32 = 5
FLOAT32 = 6
BOOL = 7
STRING = 8
ARRAY = 9
@staticmethod
def get_type(val):
if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
return GGUFValueType.STRING
elif isinstance(val, list):
return GGUFValueType.ARRAY
elif isinstance(val, float):
return GGUFValueType.FLOAT32
elif isinstance(val, bool):
return GGUFValueType.BOOL
elif isinstance(val, int):
return GGUFValueType.INT32
else:
print("Unknown type: "+str(type(val)))
sys.exit()
class GGUFWriter:
def __init__(self, path: str, arch: str, use_temp_file = True):
self.fout = open(path, "wb")
self.arch = arch
self.offset_tensor = 0
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
self.kv_data = b""
self.kv_data_count = 0
self.ti_data = b""
self.ti_data_count = 0
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
self.fout.write(struct.pack("<I", GGUF_VERSION))
self.fout.write(struct.pack("<I", self.ti_data_count))
self.fout.write(struct.pack("<I", self.kv_data_count))
self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
def write_kv_data_to_file(self):
self.fout.write(self.kv_data)
self.flush()
def write_ti_data_to_file(self):
self.fout.write(self.ti_data)
self.flush()
def add_key(self, key: str):
self.add_val(key, GGUFValueType.STRING, add_vtype=False)
def add_uint8(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.UINT8)
def add_int8(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.INT8)
def add_uint16(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.UINT16)
def add_int16(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.INT16)
def add_uint32(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.UINT32)
def add_int32(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.INT32)
def add_float32(self, key: str, val: float):
self.add_key(key)
self.add_val(val, GGUFValueType.FLOAT32)
def add_bool(self, key: str, val: bool):
self.add_key(key)
self.add_val(val, GGUFValueType.BOOL)
def add_string(self, key: str, val: str):
if len(val) == 0:
return
self.add_key(key)
self.add_val(val, GGUFValueType.STRING)
def add_array(self, key: str, val: list):
if not isinstance(val, list):
raise ValueError("Value must be a list for array type")
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)
def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)
if add_vtype:
self.kv_data += struct.pack("<I", vtype)
self.kv_data_count += 1
if vtype == GGUFValueType.UINT8:
self.kv_data += struct.pack("<B", val)
elif vtype == GGUFValueType.INT8:
self.kv_data += struct.pack("<b", val)
elif vtype == GGUFValueType.UINT16:
self.kv_data += struct.pack("<H", val)
elif vtype == GGUFValueType.INT16:
self.kv_data += struct.pack("<h", val)
elif vtype == GGUFValueType.UINT32:
self.kv_data += struct.pack("<I", val)
elif vtype == GGUFValueType.INT32:
self.kv_data += struct.pack("<i", val)
elif vtype == GGUFValueType.FLOAT32:
self.kv_data += struct.pack("<f", val)
elif vtype == GGUFValueType.BOOL:
self.kv_data += struct.pack("?", val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<I", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY:
ltype = set([GGUFValueType.get_type(item) for item in val])
assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
self.kv_data += struct.pack("<I", list(ltype)[0])
self.kv_data += struct.pack("<I", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
raise ValueError("Invalid GGUF metadata value type")
@staticmethod
def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<I", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
if self.use_temp_file and not hasattr(self, "temp_file"):
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
self.temp_file.seek(0)
self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if not self.use_temp_file:
self.tensors.append((tensor, pad))
return
tensor.tofile(self.temp_file)
if pad != 0:
self.temp_file.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray):
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
if pad != 0:
self.fout.write(bytes([0] * pad))
tensor.tofile(self.fout)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if pad != 0:
self.fout.write(bytes([0] * pad))
def write_tensors_to_file(self):
self.write_ti_data_to_file()
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
if pad != 0:
self.fout.write(bytes([0] * pad))
if not self.use_temp_file:
for (currtensor, currpad) in self.tensors:
currtensor.tofile(self.fout)
if currpad != 0:
self.fout.write(bytes([0] * currpad))
return
self.temp_file.seek(0)
shutil.copyfileobj(self.temp_file, self.fout)
self.flush()
self.temp_file.close()
def flush(self):
self.fout.flush()
def close(self):
self.fout.close()
def add_architecture(self):
self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
def add_author(self, author: str):
self.add_string(KEY_GENERAL_AUTHOR, author)
def add_url(self, url: str):
self.add_string(KEY_GENERAL_URL, url)
def add_description(self, description: str):
self.add_string(KEY_GENERAL_DESCRIPTION, description)
def add_source_url(self, url: str):
self.add_string(KEY_GENERAL_SOURCE_URL, url)
def add_source_hf_repo(self, repo: str):
self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
def add_name(self, name: str):
self.add_string(KEY_GENERAL_NAME, name)
def add_quantization_version(self, quantization_version: GGMLQuantizationType):
self.add_uint32(
KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
def add_custom_alignment(self, alignment: int):
self.data_alignment = alignment
self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
def add_context_length(self, length: int):
self.add_uint32(
KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
def add_embedding_length(self, length: int):
self.add_uint32(
KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_block_count(self, length: int):
self.add_uint32(
KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
def add_feed_forward_length(self, length: int):
self.add_uint32(
KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
def add_parallel_residual(self, use: bool):
self.add_bool(
KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
def add_tensor_data_layout(self, layout: str):
self.add_string(
KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_head_count(self, count: int):
self.add_uint32(
KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
def add_head_count_kv(self, count: int):
self.add_uint32(
KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
def add_max_alibi_bias(self, bias: float):
self.add_float32(
KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
def add_clamp_kqv(self, value: float):
self.add_float32(
KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
def add_layer_norm_eps(self, value: float):
self.add_float32(
KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
def add_layer_norm_rms_eps(self, value: float):
self.add_float32(
KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
def add_rope_dimension_count(self, count: int):
self.add_uint32(
KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_scale_linear(self, value: float):
self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model)
def add_token_list(self, tokens: List):
self.add_array(KEY_TOKENIZER_LIST, tokens)
def add_token_merges(self, merges: List):
self.add_array(KEY_TOKENIZER_MERGES, merges)
def add_token_types(self, types: List[int]):
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
def add_token_scores(self, scores: List[float]):
self.add_array(KEY_TOKENIZER_SCORES, scores)
def add_bos_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
def add_eos_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
def add_unk_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
def add_sep_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
def add_pad_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
# Example usage:
if __name__ == "__main__":
# Example usage with a file
gguf_writer = GGUFWriter("example.gguf", "llama")
gguf_writer.add_architecture()
gguf_writer.add_block_count(12)
gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer
gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float
gguf_writer.add_custom_alignment(64)
tensor1 = np.ones((32,), dtype=np.float32) * 100.0
tensor2 = np.ones((64,), dtype=np.float32) * 101.0
tensor3 = np.ones((96,), dtype=np.float32) * 102.0
gguf_writer.add_tensor("tensor1", tensor1)
gguf_writer.add_tensor("tensor2", tensor2)
gguf_writer.add_tensor("tensor3", tensor3)
gguf_writer.write_header_to_file()
gguf_writer.write_kv_data_to_file()
gguf_writer.write_tensors_to_file()
gguf_writer.close()

llama-util.h — deleted file (553 lines)

@@ -1,553 +0,0 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>
#include <string>
#include <vector>
#include <stdexcept>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif
#define LLAMA_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
LLAMA_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
LLAMA_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
LLAMA_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
// llama_context_data
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t* ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) : ptr(p) {}
void write(const void * src, size_t size) override {
memcpy(ptr, src, size);
ptr += size;
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
struct llama_data_file_context : llama_data_context {
llama_file* file;
size_t size_written = 0;
llama_data_file_context(llama_file * f) : file(f) {}
void write(const void * src, size_t size) override {
file->write_raw(src, size);
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
if (!size) {
return "FormatMessageA failed";
}
std::string ret(buf, size);
LocalFree(buf);
return ret;
}
#endif
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch >= file->size) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
if (prefetch > 0) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}
~llama_mmap() {
munmap(addr, size);
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
(void) numa;
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
if (hMapping == NULL) {
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
if (prefetch) {
// The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
// will dynamically load it using GetProcAddress.
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
HMODULE hKernel32;
// This call is guaranteed to succeed.
hKernel32 = GetModuleHandleW(L"kernel32.dll");
// This call may fail if on a pre-Win8 system.
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
if (pPrefetchVirtualMemory) {
// Advise the kernel to preload the mapped memory.
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
}
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
(void) prefetch;
(void) numa;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() {}
llama_mlock(const llama_mlock &) = delete;
~llama_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * ptr) {
LLAMA_ASSERT(addr == NULL && size == 0);
addr = ptr;
}
void grow_to(size_t target_size) {
LLAMA_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) {
if (!mlock(addr, size)) {
return true;
} else {
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
}
#undef MLOCK_SUGGESTION
void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * ptr, size_t len) {
for (int tries = 1; ; tries++) {
if (VirtualLock(ptr, len)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
len, size, llama_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = len + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
}
}
void raw_unlock(void * ptr, size_t len) {
if (!VirtualUnlock(ptr, len)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t len) {
fprintf(stderr, "warning: mlock not supported on this system\n");
return false;
}
void raw_unlock(const void * addr, size_t len) {}
#endif
};
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
llama_buffer() = default;
void resize(size_t len) {
#ifdef GGML_USE_METAL
free(addr);
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
}
else {
addr = NULL;
}
#else
delete[] addr;
addr = new uint8_t[len];
#endif
size = len;
}
~llama_buffer() {
#ifdef GGML_USE_METAL
free(addr);
#else
delete[] addr;
#endif
addr = NULL;
}
// disable copy and move
llama_buffer(const llama_buffer&) = delete;
llama_buffer(llama_buffer&&) = delete;
llama_buffer& operator=(const llama_buffer&) = delete;
llama_buffer& operator=(llama_buffer&&) = delete;
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
size_t size = 0;
llama_ctx_buffer() = default;
void resize(size_t size) {
free();
addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
is_cuda = true;
}
else {
// fall back to pageable memory
addr = new uint8_t[size];
is_cuda = false;
}
this->size = size;
}
void free() {
if (addr) {
if (is_cuda) {
ggml_cuda_host_free(addr);
}
else {
delete[] addr;
}
}
addr = NULL;
}
~llama_ctx_buffer() {
free();
}
// disable copy and move
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif

llama.cpp — 3212 changed lines (file diff suppressed because it is too large)

llama.h — 265 changed lines
@@ -34,29 +34,18 @@
#    define DEPRECATED(func, hint) func
#endif

-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
-#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1

#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif

-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif

#ifdef __cplusplus
extern "C" {
#endif
@@ -72,6 +61,50 @@ extern "C" {
    typedef int llama_token;
enum llama_log_level {
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
};
enum llama_token_type {
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
LLAMA_TOKEN_TYPE_NORMAL = 1,
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
LLAMA_TOKEN_TYPE_CONTROL = 3,
LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
LLAMA_TOKEN_TYPE_UNUSED = 5,
LLAMA_TOKEN_TYPE_BYTE = 6,
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};
    typedef struct llama_token_data {
        llama_token id; // token id
        float logit;    // log-odds of the token
@@ -86,25 +119,10 @@ extern "C" {
    typedef void (*llama_progress_callback)(float progress, void *ctx);

-   enum llama_log_level {
-       LLAMA_LOG_LEVEL_ERROR = 2,
-       LLAMA_LOG_LEVEL_WARN  = 3,
-       LLAMA_LOG_LEVEL_INFO  = 4
-   };
-
-   // Signature for logging events
-   // Note that text includes the new line character at the end for most events.
-   // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-   // if it exists.
-   // It might not exist for progress report where '.' is output repeatedly.
-   typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
-
    struct llama_context_params {
        uint32_t seed;         // RNG seed, -1 for random
        int32_t  n_ctx;        // text context
        int32_t  n_batch;      // prompt processing batch size
-       int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-       float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
@@ -129,33 +147,18 @@ extern "C" {
        bool use_mlock;  // force system to keep model in RAM
        bool embedding;  // embedding mode only
    };

-   // model file types
-   enum llama_ftype {
-       LLAMA_FTYPE_ALL_F32              = 0,
-       LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-       // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-       // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-       LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
-   };
+   // Signature for logging events
+   // Note that text includes the new line character at the end for most events.
+   // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+   // if it exists.
+   // It might not exist for progress report where '.' is output repeatedly.
+   typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);

    // model quantization parameters
    typedef struct llama_model_quantize_params {
        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype;      // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
    } llama_model_quantize_params;
@@ -208,27 +211,16 @@ extern "C" {
        int32_t n_eval;
    };

-   // Set callback for all future logging events.
-   // If this is not called, or NULL is supplied, everything is output on stderr.
-   LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-   LLAMA_API int llama_max_devices();
-
-   LLAMA_API struct llama_context_params llama_context_default_params();
-   LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-   LLAMA_API bool llama_mmap_supported();
-   LLAMA_API bool llama_mlock_supported();
-
-   // TODO: not great API - very likely to change
+   LLAMA_API struct llama_context_params llama_context_default_params(void);
+   LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
+
    // Initialize the llama + ggml backend
    // If numa is true, use NUMA optimizations
    // Call once at the start of the program
    LLAMA_API void llama_backend_init(bool numa);

-   // Call once at the end of the program - currently only used for MPI
-   LLAMA_API void llama_backend_free();
-
-   LLAMA_API int64_t llama_time_us();
+   // Call once at the end of the program - currently only used for MPI
+   LLAMA_API void llama_backend_free(void);

    LLAMA_API struct llama_model * llama_load_model_from_file(
            const char * path_model,
@@ -240,17 +232,26 @@ extern "C" {
            struct llama_model * model,
            struct llama_context_params params);

-   // Various functions for loading a ggml llama model.
-   // Allocate (almost) all memory needed for the model.
-   // Return NULL on failure
-   LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-           const char * path_model,
-           struct llama_context_params params),
-           "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

+   LLAMA_API int64_t llama_time_us(void);
+
+   LLAMA_API int  llama_max_devices    (void);
+   LLAMA_API bool llama_mmap_supported (void);
+   LLAMA_API bool llama_mlock_supported(void);
+
+   LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+   LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+   LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+   LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+   LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+   LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+   // Get a string describing the model type
+   LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
    // Returns 0 on success
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
@@ -272,9 +273,9 @@ extern "C" {
    LLAMA_API int llama_model_apply_lora_from_file(
            const struct llama_model * model,
            const char * path_lora,
            const char * path_base_model,
            int n_threads);

    // Returns the number of tokens in the KV cache
    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -324,11 +325,40 @@ extern "C" {
    // IMPORTANT: do not use for anything else other than debugging and testing!
    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
//
// Vocab
//
LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
//
// Tokenization
//
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
    // Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
    LLAMA_API int llama_tokenize(
            struct llama_context * ctx,
            const char * text,
@@ -336,6 +366,13 @@ extern "C" {
            int n_max_tokens,
            bool add_bos);
LLAMA_API int llama_tokenize_bpe(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
    LLAMA_API int llama_tokenize_with_model(
            const struct llama_model * model,
            const char * text,
@ -343,55 +380,30 @@ extern "C" {
int n_max_tokens, int n_max_tokens,
bool add_bos); bool add_bos);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
LLAMA_API int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
// Does not write null terminator to the buffer
LLAMA_API int llama_token_to_str(
const struct llama_context * ctx,
llama_token token,
char * buf,
int length);

LLAMA_API int llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token,
char * buf,
int length);

LLAMA_API int llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token,
char * buf,
int length);
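To show how the buffer-based conversion can be wrapped into a std::string (a sketch; it assumes, mirroring llama_tokenize above, that a negative return value reports the required buffer size, and the string-returning llama_token_to_str the tests below rely on comes from common, not from this header):

    // sketch: token id -> std::string on top of the buffer API
    static std::string token_to_string(struct llama_context * ctx, llama_token token) {
        std::vector<char> buf(8);
        int n = llama_token_to_str(ctx, token, buf.data(), (int) buf.size());
        if (n < 0) {
            buf.resize((size_t) -n); // assumed meaning of a negative return
            n = llama_token_to_str(ctx, token, buf.data(), (int) buf.size());
        }
        return std::string(buf.data(), n > 0 ? (size_t) n : 0); // no null terminator is written
    }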
//
// Grammar
//
LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
@ -399,7 +411,9 @@ extern "C" {
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
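For orientation, a sketch of how these two entry points are typically driven from a parsed grammar; test-llama-grammar.cpp below does the same thing with a hand-built parse_state, and grammar_parser::parse turning GBNF text into that state is an assumption about the common grammar-parser helper:

    // sketch: build a llama_grammar from GBNF text; release it with llama_grammar_free() when done
    // (needs grammar-parser.h from common, as used by the tests below)
    static struct llama_grammar * make_grammar(const char * gbnf) {
        grammar_parser::parse_state parsed = grammar_parser::parse(gbnf); // assumed helper
        if (parsed.rules.empty()) {
            return NULL; // parse failure
        }
        std::vector<const llama_grammar_element *> rules = parsed.c_rules();
        return llama_grammar_init(rules.data(), rules.size(), parsed.symbol_ids.at("root"));
    }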
//
// Sampling functions
//
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@ -468,6 +482,10 @@ extern "C" {
// Print system information
LLAMA_API const char * llama_print_system_info(void);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
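A sketch of hooking this up; the exact llama_log_callback signature is not shown in this diff, so the assumption here is that the callback receives a log level, the preformatted message text, and the user_data pointer given to llama_log_set:

    // sketch: route llama.cpp logging to a caller-owned FILE* instead of stderr (requires <cstdio>)
    static void file_log_callback(enum llama_log_level level, const char * text, void * user_data) {
        (void) level;                    // level-based filtering could be added here
        fputs(text, (FILE *) user_data); // messages arrive already formatted
    }
    // usage: llama_log_set(file_log_callback, log_fp);  // log_fp is a FILE* the caller opened
    //        llama_log_set(NULL, NULL);                 // back to the default stderr output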
#ifdef __cplusplus
}
#endif
@ -477,10 +495,11 @@ extern "C" {
#include <vector>
#include <string>
struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H

1
models/.editorconfig Normal file
View file

@ -0,0 +1 @@
root = true

Binary file not shown.

Binary file not shown.

View file

@ -1,16 +1,36 @@
function(llama_build_executable source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE llama common)
endfunction()
function(llama_test_executable name source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
# add_executable(${TEST_TARGET} ${source})
# install(TARGETS ${TEST_TARGET} RUNTIME)
# target_link_libraries(${TEST_TARGET} PRIVATE llama)
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
function(llama_build_and_test_executable source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE llama common)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
# llama_build_and_test_executable(test-double-float.cpp) # SLOW
llama_build_and_test_executable(test-quantize-fns.cpp)
llama_build_and_test_executable(test-quantize-perf.cpp)
llama_build_and_test_executable(test-sampling.cpp)
llama_build_executable(test-tokenizer-0.cpp)
llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-1.cpp)
llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW
# llama_build_and_test_executable(test-opt.cpp) # SLOW

View file

@ -3,7 +3,8 @@
#endif
#include "llama.h"
#include "grammar-parser.h"
#include <cassert>
int main()

View file

@ -0,0 +1,403 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include "llama.cpp" // TODO: not great
#include "grammar-parser.h"
#include <cassert>
int main()
{
grammar_parser::parse_state parsed_grammar;
std::vector<std::pair<std::string, uint32_t>> expected = {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
};
std::vector<std::vector<llama_grammar_element>> expected_rules = {
{{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
{{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 40},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 41},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_CHAR_ALT, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_CHAR_ALT, 95},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 32},
{LLAMA_GRETYPE_CHAR_ALT, 9},
{LLAMA_GRETYPE_CHAR_ALT, 10},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
};
for (auto pair : expected)
{
parsed_grammar.symbol_ids[pair.first] = pair.second;
}
for (auto rule : expected_rules)
{
parsed_grammar.rules.push_back({});
for (auto element : rule)
{
parsed_grammar.rules.back().push_back(element);
}
}
llama_grammar *grammar = NULL;
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
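// after initialization, grammar->stacks holds the possible parse positions for the root rule;
// each stack is compared element by element against expected_stacks below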
std::vector<std::vector<llama_grammar_element>> expected_stacks = {
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
}};
auto index = 0;
for (auto stack : grammar->stacks)
{
// compare stack to expected_stack
for (uint32_t i = 0; i < stack.size(); i++)
{
auto element = stack[i];
auto expected_element = expected_stacks[index][i];
// pretty print error message before asserting
if (expected_element.type != element->type || expected_element.value != element->value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element->type && expected_element.value == element->value);
}
index++;
}
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24);
for (size_t i = 0; i < 24; ++i)
{
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
cp[0] = 37 + i;
cp[1] = 0;
next_candidates[i] = {i, cp, {}};
}
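// the 24 candidates built above are the single code points 37 ('%') through 60 ('<');
// expected_reject below lists, for each grammar stack, which of them must be rejected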
std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
};
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
std::vector<std::vector<llama_grammar_candidate>> all_rejects;
for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
{
rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
all_rejects.push_back(rejects);
}
index = 0;
for (auto rej : all_rejects)
{
for (uint32_t i = 0; i < rej.size(); i++)
{
auto element = rej[i];
auto expected_element = expected_reject[index][i];
assert(element.index == expected_element.first && *element.code_points == expected_element.second);
}
index++;
}
for (auto &candidate : next_candidates)
{
delete[] candidate.code_points;
candidate.code_points = nullptr;
}
delete grammar;
return 0;
}

View file

@ -1,22 +1,47 @@
#include "llama.h" #include "llama.h"
#include "common.h"
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <map> #include <map>
#include <vector> #include <vector>
static const std::map<std::string, std::vector<llama_token>> & k_tests() static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
{ std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
result += llama_token_to_str(ctx, tokens[i]);
}
return result;
}
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
static std::map<std::string, std::vector<llama_token>> _k_tests = {
{ " ", {1, 259, }, },
{ "\t", { 1, 29871, 12, }, },
{ "\n", { 1, 29871, 13, }, },
{ "\t\n", { 1, 29871, 12, 13, }, },
{ "Hello world", { 1, 15043, 3186, }, },
{ " Hello world", { 1, 29871, 15043, 3186, }, },
{ "Hello World", { 1, 15043, 2787, }, },
{ " Hello World", { 1, 29871, 15043, 2787, }, },
{ " Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
{ " this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
{ "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
{ "нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
{ "កាន់តែពិសេសអាចខលចេញ", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
136, 228, 162, 132, 228, 161, 140, }, },
{ "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
{ 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
};
return _k_tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
@ -64,10 +89,12 @@ int main(int argc, char **argv) {
return 2;
}
bool success = true;
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
bool correct = res.size() == test_kv.second.size();
@ -78,7 +105,8 @@ int main(int argc, char **argv) {
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__); fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) { for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t); fprintf(stderr, "%6d, ", t);
@ -90,9 +118,7 @@ int main(int argc, char **argv) {
} }
fprintf(stderr, "\n"); fprintf(stderr, "\n");
llama_free_model(model); success = false;
llama_free(ctx);
return 3;
} }
} }
@ -101,5 +127,5 @@ int main(int argc, char **argv) {
llama_backend_free(); llama_backend_free();
return 0; return success ? 0 : 3;
} }

131
tests/test-tokenizer-1.cpp Normal file
View file

@ -0,0 +1,131 @@
#include "llama.h"
#include "common.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>
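// "\xe2\x96\x81" below is UTF-8 for U+2581 ('▁'), the marker SentencePiece-style vocabularies
// use in place of spaces; escape_whitespace inserts it the way the tokenizer expects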
static std::string escape_whitespace(const std::string& text) {
std::string result;
bool escaping = false;
result += "\xe2\x96\x81";
for (size_t offs = 0; offs < text.length(); ++offs) {
if (text[offs] == ' ') {
if (!escaping) {
result += "\xe2\x96\x81";
escaping = true;
}
}
else {
escaping = false;
result += text[offs];
}
}
return result;
}
static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
result += llama_token_to_str(ctx, tokens[i]);
}
return result;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto lparams = llama_context_default_params();
lparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
const int n_vocab = llama_n_vocab(ctx);
for (int i = 0; i < n_vocab; ++i) {
std::string forward = llama_token_to_str_bpe(ctx, i);
std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
if (tokens.size() == 1) {
if (i != tokens[0]) {
std::string backward = llama_token_to_str(ctx, tokens[0]);
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
__func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
return 2;
}
} else {
llama_token_type type = llama_token_get_type(ctx, i);
if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
} else {
fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
return 2;
}
}
}
#ifdef _WIN32
std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
std::u16string u16str(1, ch);
std::string str = u16converter.to_bytes(u16str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n",
__func__, str.c_str(), tokens[0]);
}
}
std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
std::u32string u32str(1, ch);
std::string str = u32converter.to_bytes(u32str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
}
}
#endif
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return 0;
}