Merge commit '469c9addef' into nomic-vulkan

2023-11-14 12:00:37 -05:00 · 2023-11-14 12:00:37 -05:00 · 2a41ba7258
commit 2a41ba7258
parent a934b2cb8a 469c9addef
102 changed files with 19073 additions and 8584 deletions
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -12,24 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
+    add_subdirectory(baby-llama)
+    add_subdirectory(batched)
+    add_subdirectory(batched-bench)
+    add_subdirectory(beam-search)
+    add_subdirectory(benchmark)
+    add_subdirectory(convert-llama2c-to-ggml)
+    add_subdirectory(embedding)
+    add_subdirectory(finetune)
+    add_subdirectory(infill)
+    add_subdirectory(llama-bench)
+    add_subdirectory(llava)
    add_subdirectory(main)
+    add_subdirectory(parallel)
+    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
-    add_subdirectory(perplexity)
-    add_subdirectory(embedding)
    add_subdirectory(save-load-state)
-    add_subdirectory(benchmark)
-    add_subdirectory(baby-llama)
-    add_subdirectory(train-text-from-scratch)
-    add_subdirectory(finetune)
-    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
-    add_subdirectory(batched)
    add_subdirectory(speculative)
-    add_subdirectory(parallel)
-    add_subdirectory(embd-input)
-    add_subdirectory(llama-bench)
-    add_subdirectory(beam-search)
+    add_subdirectory(train-text-from-scratch)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
@ -0,0 +1,5 @@
+set(TARGET batched-bench)
+add_executable(${TARGET} batched-bench.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@ -0,0 +1,51 @@
+# llama.cpp/examples/batched-bench
+
+Benchmark the batched decoding performance of `llama.cpp`
+
+## Usage
+
+There are 2 modes of operation:
+
+- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
+- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
+
+```bash
+./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+
+# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+
+# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+
+# custom set of batches
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+```
+
+## Sample results
+
+- `PP` - prompt tokens per batch
+- `TG` - generated tokens per batch
+- `B` - number of batches
+- `N_KV` - required KV cache size
+- `T_PP` - prompt processing time (i.e. time to first token)
+- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
+- `T_TG` - time to generate all batches
+- `S_TG` - text generation speed (`(B*TG)/T_TG`)
+- `T` - total time
+- `S` - total speed (i.e. all tokens / total time)
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   128 |    128 |    1 |    256 |    0.108 |  1186.64 |    3.079 |    41.57 |    3.187 |    80.32 |
+|   128 |    128 |    2 |    512 |    0.198 |  1295.19 |    5.029 |    50.90 |    5.227 |    97.95 |
+|   128 |    128 |    4 |   1024 |    0.373 |  1373.96 |    6.878 |    74.44 |    7.251 |   141.23 |
+|   128 |    128 |    8 |   2048 |    0.751 |  1363.27 |    7.344 |   139.43 |    8.095 |   252.99 |
+|   128 |    128 |   16 |   4096 |    1.570 |  1304.68 |    8.455 |   242.23 |   10.024 |   408.60 |
+|   128 |    128 |   32 |   8192 |    3.408 |  1201.73 |    8.801 |   465.40 |   12.209 |   670.96 |
+|   128 |    256 |    1 |    384 |    0.107 |  1196.70 |    6.329 |    40.45 |    6.436 |    59.67 |
+|   128 |    256 |    2 |    768 |    0.194 |  1317.45 |   10.239 |    50.00 |   10.433 |    73.61 |
+|   128 |    256 |    4 |   1536 |    0.366 |  1399.03 |   13.960 |    73.35 |   14.326 |   107.22 |
+|   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
+|   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
+|   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -0,0 +1,243 @@
+#include "common.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// Splits a comma-separated list of integers and converts every field with std::atoi.
+// NOTE: the input buffer is modified - each ',' is overwritten with '\0'.
+static std::vector<int> parse_list(char * p) {
+    std::vector<int> values;
+
+    // start of the field that is currently being scanned
+    char * field = p;
+
+    for (char * cur = p; *cur != '\0'; ++cur) {
+        if (*cur != ',') {
+            continue;
+        }
+
+        // cut the string at the separator and convert the field that just ended
+        *cur = '\0';
+        values.push_back(std::atoi(field));
+        field = cur + 1;
+    }
+
+    // trailing field (this is the whole string when it contains no comma)
+    values.push_back(std::atoi(field));
+
+    return values;
+}
+
+// Benchmarks batched decoding for every (PP, TG, PL) combination that fits in N_KV_MAX and prints
+// one markdown table row per combination.
+//
+// usage: batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+//
+// returns 0 on success, 1 on a usage error or when loading / decoding fails
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
+        printf("  <PP>, <TG> and <PL> are comma-separated lists of numbers without spaces\n\n");
+        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        return 1 ;
+    }
+
+    int n_kv_max     = 2048;
+    int is_pp_shared = 0;
+    int n_gpu_layers = 0;
+    int mmq          = 0;
+
+    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
+    std::vector<int> n_tg = { 128, 256, };
+    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
+    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
+
+    // positional arguments - each one is optional and overrides the default above
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+
+    if (argc >= 3) {
+        n_kv_max = std::atoi(argv[2]);
+    }
+
+    if (argc >= 4) {
+        is_pp_shared = std::atoi(argv[3]);
+    }
+
+    if (argc >= 5) {
+        n_gpu_layers = std::atoi(argv[4]);
+    }
+
+    if (argc >= 6) {
+        mmq = std::atoi(argv[5]);
+    }
+
+    if (argc >= 7) {
+        n_pp = parse_list(argv[6]);
+    }
+
+    if (argc >= 8) {
+        n_tg = parse_list(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_pl = parse_list(argv[8]);
+    }
+
+    // init LLM
+
+    llama_backend_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = llama_model_default_params();
+
+    model_params.n_gpu_layers = n_gpu_layers;
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    ctx_params.seed      = 1234;
+    ctx_params.n_ctx     = n_kv_max;
+    ctx_params.n_batch   = 512;
+    ctx_params.mul_mat_q = mmq;
+
+    ctx_params.n_threads       = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // the batch is sized for n_kv_max tokens - nothing below may add more than that
+    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
+
+    // decode in batches of ctx_params.n_batch tokens
+    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
+        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+            // view into the i-th chunk of the full batch (no copies, only offset pointers)
+            llama_batch batch_view = {
+                n_tokens,
+                batch.token    + i,
+                nullptr,
+                batch.pos      + i,
+                batch.n_seq_id + i,
+                batch.seq_id   + i,
+                batch.logits   + i,
+                0, 0, 0, // unused
+            };
+
+            const int ret = llama_decode(ctx, batch_view);
+            if (ret != 0) {
+                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                return false;
+            }
+        }
+
+        return true;
+    };
+
+    // warm up
+    {
+        // never add more tokens than the batch was allocated for (n_kv_max may be smaller than 16)
+        const int n_warmup = std::min(16, n_kv_max);
+
+        for (int i = 0; i < n_warmup; ++i) {
+            llama_batch_add(batch, 0, i, { 0 }, false);
+        }
+
+        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+    }
+
+    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
+    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+
+    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
+        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
+            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
+                const int pp = n_pp[i_pp];
+                const int tg = n_tg[i_tg];
+                const int pl = n_pl[i_pl];
+
+                // KV cells needed: a shared prompt is stored once, otherwise once per sequence
+                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
+
+                if (n_ctx_req > n_kv_max) {
+                    continue;
+                }
+
+                llama_batch_clear(batch);
+
+                const int n_tokens = is_pp_shared ? pp : pl*pp;
+
+                for (int i = 0; i < n_tokens; ++i) {
+                    llama_batch_add(batch, 0, i, { 0 }, false);
+                }
+
+                // request logits for the last prompt token only; with an empty prompt (e.g. PP == 0)
+                // there is no last token and index -1 must not be written
+                if (batch.n_tokens > 0) {
+                    batch.logits[batch.n_tokens - 1] = true;
+                }
+
+                const auto t_pp_start = ggml_time_us();
+
+                llama_kv_cache_tokens_rm(ctx, -1, -1);
+
+                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    return 1;
+                }
+
+                if (is_pp_shared) {
+                    // share the prompt processed in sequence 0 with the other sequences
+                    for (int32_t i = 1; i < pl; ++i) {
+                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
+                    }
+                }
+
+                const auto t_pp_end = ggml_time_us();
+
+                const auto t_tg_start = ggml_time_us();
+
+                for (int i = 0; i < tg; ++i) {
+                    llama_batch_clear(batch);
+
+                    // one new token per sequence at position pp + i
+                    for (int j = 0; j < pl; ++j) {
+                        llama_batch_add(batch, 0, pp + i, { j }, true);
+                    }
+
+                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+                        LOG_TEE("%s: llama_decode() failed\n", __func__);
+                        return 1;
+                    }
+                }
+
+                const auto t_tg_end = ggml_time_us();
+
+                const int32_t n_kv = n_ctx_req;
+
+                // ggml_time_us() is in microseconds - convert to seconds
+                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
+                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
+                const float t    = t_pp + t_tg;
+
+                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
+                const float speed_tg = pl*tg / t_tg;
+                const float speed    = n_kv / t;
+
+                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            }
+        }
+    }
+
+    llama_print_timings(ctx);
+
+    llama_batch_free(batch);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    fprintf(stderr, "\n\n");
+
+    return 0;
+}
--- a/examples/batched.swift/.gitignore
+++ b/examples/batched.swift/.gitignore
@ -0,0 +1,9 @@
+.DS_Store
+/.build
+/Packages
+xcuserdata/
+DerivedData/
+.swiftpm/configuration/registries.json
+.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
+.netrc
+batched_swift
--- a/examples/batched.swift/Makefile
+++ b/examples/batched.swift/Makefile
@ -0,0 +1,6 @@
+.PHONY: build
+
+build:
+	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
+	rm -f ./batched_swift
+	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
--- a/examples/batched.swift/Package.swift
+++ b/examples/batched.swift/Package.swift
@ -0,0 +1,22 @@
+// swift-tools-version: 5.5
+// The swift-tools-version declares the minimum version of Swift required to build this package.
+
+import PackageDescription
+
+let package = Package(
+    name: "batched_swift",
+    platforms: [.macOS(.v12)],
+    dependencies: [
+        .package(name: "llama", path: "../../"),
+    ],
+    targets: [
+        // Targets are the basic building blocks of a package, defining a module or a test suite.
+        // Targets can depend on other targets in this package and products from dependencies.
+        .executableTarget(
+            name: "batched_swift",
+            dependencies: ["llama"],
+            path: "Sources",
+            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
+        ),
+    ]
+)
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@ -0,0 +1,4 @@
+This is a swift clone of `examples/batched`.
+
+$ `make`
+$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -0,0 +1,263 @@
+import Foundation
+import llama
+
+// Positional command line arguments: MODEL_PATH [PROMPT] [PARALLEL]
+let arguments = CommandLine.arguments
+
+// Check that we have at least one argument (the model path)
+guard arguments.count > 1 else {
+    print("Usage: batched_swift MODEL_PATH [PROMPT] [PARALLEL]")
+    exit(1)
+}
+
+let modelPath: String = arguments[1]
+let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
+// A missing or non-numeric PARALLEL falls back to 1. Values below 1 are clamped to 1 because the
+// `1 ..< n_parallel` range used further down traps at runtime when n_parallel < 1.
+let n_parallel: Int = arguments.count > 3 ? max(1, Int(arguments[3]) ?? 1) : 1
+
+// total length of the sequences including the prompt
+let n_len: Int = 32
+
+// init LLM
+llama_backend_init(false)
+defer {
+    llama_backend_free()
+}
+
+let model_params = llama_model_default_params()
+guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
+    print("Failed to load model")
+    exit(1)
+}
+
+defer {
+    llama_free_model(model)
+}
+
+var tokens = tokenize(text: prompt, add_bos: true)
+
+let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
+
+var context_params = llama_context_default_params()
+context_params.seed = 1234
+context_params.n_ctx = n_kv_req
+context_params.n_batch = UInt32(max(n_len, n_parallel))
+context_params.n_threads = 8
+context_params.n_threads_batch = 8
+
+let context = llama_new_context_with_model(model, context_params)
+guard context != nil else {
+    print("Failed to initialize context")
+    exit(1)
+}
+
+defer {
+    llama_free(context)
+}
+
+let n_ctx = llama_n_ctx(context)
+
+print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
+
+// abort when the created context is smaller than the KV cache size this run requires
+if n_kv_req > n_ctx {
+    // print() does not expand printf-style specifiers such as %d - interpolate the value instead
+    print("error: n_kv_req (\(n_kv_req)) > n_ctx, the required KV cache size is not big enough\n")
+    exit(1)
+}
+
+var buffer: [CChar] = []
+for id: llama_token in tokens {
+    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
+}
+
+print("\n")
+
+var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
+defer {
+    llama_batch_free(batch)
+}
+
+// evaluate the initial prompt
+batch.n_tokens = Int32(tokens.count)
+
+for (i, token) in tokens.enumerated() {
+    batch.token[i] = token
+    batch.pos[i] = Int32(i)
+    batch.n_seq_id[i] = 1
+    // batch.seq_id[i][0] = 0
+    // TODO: is this the proper way to do this?
+    if let seq_id = batch.seq_id[i] {
+        seq_id[0] = 0
+    }
+    batch.logits[i] = 0
+}
+
+// llama_decode will output logits only for the last token of the prompt
+batch.logits[Int(batch.n_tokens) - 1] = 1
+
+if llama_decode(context, batch) != 0 {
+    print("llama_decode() failed")
+    exit(1)
+}
+
+for i in 1 ..< n_parallel {
+    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+}
+
+if n_parallel > 1 {
+    print("generating \(n_parallel) sequences ...\n")
+}
+
+var streams: [String] = .init(repeating: "", count: n_parallel)
+var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
+var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
+
+var n_cur = batch.n_tokens
+var n_decode = 0
+
+let t_main_start = ggml_time_us()
+
+// main generation loop: sample one token per unfinished stream, then decode all of them in one batch
+while n_cur <= n_len {
+    // prepare the next batch
+    batch.n_tokens = 0
+
+    // sample the next token for each parallel sequence / stream
+    for i in 0 ..< n_parallel {
+        if i_batch[i] < 0 {
+            // the stream has already finished
+            continue
+        }
+
+        let n_vocab = llama_n_vocab(model)
+        let logits = llama_get_logits_ith(context, i_batch[i])
+
+        // start from an empty array: pre-filling it with n_vocab default entries and then appending
+        // would leave 2 * n_vocab candidates, half of them bogus entries for token 0
+        var candidates: [llama_token_data] = []
+        candidates.reserveCapacity(Int(n_vocab))
+
+        for token_id in 0 ..< n_vocab {
+            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
+        }
+
+        var candidates_p: llama_token_data_array = .init(
+            data: &candidates,
+            size: candidates.count,
+            sorted: false
+        )
+
+        let top_k: Int32 = 40
+        let top_p: Float = 0.9
+        let temp: Float = 0.4
+
+        llama_sample_top_k(context, &candidates_p, top_k, 1)
+        llama_sample_top_p(context, &candidates_p, top_p, 1)
+        llama_sample_temp(context, &candidates_p, temp)
+
+        let new_token_id = llama_sample_token(context, &candidates_p)
+
+        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+
+        // is it an end of stream? -> mark the stream as finished
+        // NOTE(review): the C++ examples in this change pass the model (not the context) to
+        // llama_token_eos; both are opaque pointers in Swift, so the compiler cannot catch a mix-up
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+            i_batch[i] = -1
+            // print("")
+            if n_parallel > 1 {
+                print("stream \(i) finished at n_cur = \(n_cur)")
+            }
+
+            continue
+        }
+
+        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
+
+        // if there is only one stream, we print immediately to stdout
+        if n_parallel == 1 {
+            print(nextStringPiece, terminator: "")
+        }
+        streams[i] += nextStringPiece
+
+        // push this new token for next evaluation
+        batch.token[Int(batch.n_tokens)] = new_token_id
+        batch.pos[Int(batch.n_tokens)] = n_cur
+        batch.n_seq_id[Int(batch.n_tokens)] = 1
+        if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
+            seq_id[0] = Int32(i)
+        }
+        batch.logits[Int(batch.n_tokens)] = 1
+
+        // remember where this stream's logits will be in the next decode
+        i_batch[i] = batch.n_tokens
+
+        batch.n_tokens += 1
+
+        n_decode += 1
+    }
+
+    // all streams are finished
+    if batch.n_tokens == 0 {
+        break
+    }
+
+    n_cur += 1
+
+    // evaluate the current batch with the transformer model
+    if llama_decode(context, batch) != 0 {
+        print("llama_decode() failed")
+        exit(1)
+    }
+}
+
+if n_parallel > 1 {
+    print("\n")
+    for (i, stream) in streams.enumerated() {
+        print("sequence \(i):\n\n\(prompt)\(stream)\n")
+    }
+}
+
+let t_main_end = ggml_time_us()
+
+print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
+
+llama_print_timings(context)
+
+/// Converts `text` into tokens using the globally loaded `model`.
+///
+/// - Parameters:
+///   - text: the string to tokenize
+///   - add_bos: forwarded to `llama_tokenize`; one extra slot is reserved when it is set
+/// - Returns: the tokens, or an empty array when `llama_tokenize` reports no tokens / a failure
+private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
+    // the text reaches the C API as a UTF-8 C string, so its length must be counted in bytes;
+    // `text.count` counts Characters and is too small for non-ASCII input
+    let utf8Count = text.utf8.count
+    let n_tokens = utf8Count + (add_bos ? 1 : 0)
+    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
+    defer {
+        tokens.deallocate()
+    }
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    // a negative count would make `0 ..< tokenCount` trap - treat it as "no tokens"
+    guard tokenCount > 0 else {
+        return []
+    }
+    return Array(UnsafeBufferPointer(start: tokens, count: Int(tokenCount)))
+}
+
+/// Converts a token to its text piece, carrying incomplete UTF-8 sequences over between calls.
+///
+/// - Parameters:
+///   - token: the token to convert
+///   - buffer: bytes of pieces that did not form valid UTF-8 yet; appended to and reset here
+/// - Returns: the decoded text, or `nil` while the accumulated bytes are not valid UTF-8
+private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
+    var result = [CChar](repeating: 0, count: 8)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    if nTokens < 0 {
+        // the 8 byte buffer was too small: the negated return value is the size that is needed
+        // (see llama.h), so grow the buffer to that size and convert again
+        let actualTokensCount = -Int(nTokens)
+        result = .init(repeating: 0, count: actualTokensCount)
+        let check = llama_token_to_piece(
+            model,
+            token,
+            &result,
+            Int32(result.count)
+        )
+        assert(check == actualTokensCount)
+    } else {
+        // drop the unused tail of the fixed size buffer
+        result.removeLast(result.count - Int(nTokens))
+    }
+    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
+        return utfString
+    } else {
+        buffer.append(contentsOf: result)
+        let data = Data(buffer.map { UInt8(bitPattern: $0) })
+        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
+            buffer = []
+        }
+        guard let bufferString = String(data: data, encoding: .utf8) else {
+            return nil
+        }
+        buffer = []
+        return bufferString
+    }
+}
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -11,12 +11,16 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
        return 1 ;
    }

+    // number of parallel batches
    int n_parallel = 1;

+    // total length of the sequences including the prompt
+    int n_len = 32;
+
    if (argc >= 2) {
        params.model = argv[1];
    }
@ -29,13 +33,14 @@ int main(int argc, char ** argv) {
        n_parallel = std::atoi(argv[3]);
    }

+    if (argc >= 5) {
+        n_len = std::atoi(argv[4]);
+    }
+
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

-    // total length of the sequences including the prompt
-    const int n_len = 32;
-
    // init LLM

    llama_backend_init(params.numa);
@ -66,7 +71,7 @@ int main(int argc, char ** argv) {
    ctx_params.seed  = 1234;
    ctx_params.n_ctx = n_kv_req;
    ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_threads = params.n_threads;
+    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@ -97,20 +102,15 @@ int main(int argc, char ** argv) {

    fflush(stderr);

-    // create a llama_batch with size 512
+    // create a llama_batch
    // we use this object to submit token data for decoding
-
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);

    // evaluate the initial prompt
-    batch.n_tokens = tokens_list.size();
-
-    for (int32_t i = 0; i < batch.n_tokens; i++) {
-        batch.token[i]  = tokens_list[i];
-        batch.pos[i]    = i;
-        batch.seq_id[i] = 0;
-        batch.logits[i] = false;
+    for (size_t i = 0; i < tokens_list.size(); ++i) {
+        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
    }
+    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;
@ -146,7 +146,7 @@ int main(int argc, char ** argv) {

    while (n_cur <= n_len) {
        // prepare the next batch
-        batch.n_tokens = 0;
+        llama_batch_clear(batch);

        // sample the next token for each parallel sequence / stream
        for (int32_t i = 0; i < n_parallel; ++i) {
@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                i_batch[i] = -1;
                LOG_TEE("\n");
                if (n_parallel > 1) {
@ -198,15 +198,10 @@ int main(int argc, char ** argv) {

            streams[i] += llama_token_to_piece(ctx, new_token_id);

-            // push this new token for next evaluation
-            batch.token [batch.n_tokens] = new_token_id;
-            batch.pos   [batch.n_tokens] = n_cur;
-            batch.seq_id[batch.n_tokens] = i;
-            batch.logits[batch.n_tokens] = true;
-
            i_batch[i] = batch.n_tokens;

-            batch.n_tokens += 1;
+            // push this new token for next evaluation
+            llama_batch_add(batch, new_token_id, n_cur, { i }, true);

            n_decode += 1;
        }
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
 }

 // Function matching type llama_beam_search_callback_fn_t.
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
    if (file.size < 4) {
        return false;
    }
-    uint32_t magic = file.read_u32();
+    std::string magic = file.read_string(4);
    return magic == GGUF_MAGIC;
 }

--- a/examples/embd-input/.gitignore
+++ b/examples/embd-input/.gitignore
@ -1,4 +0,0 @@
-PandaGPT
-MiniGPT-4
-*.pth
-
--- a/examples/embd-input/CMakeLists.txt
+++ b/examples/embd-input/CMakeLists.txt
@ -1,17 +0,0 @@
-set(TARGET embdinput)
-add_library(${TARGET} embd-input-lib.cpp embd-input.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
-
-set(TARGET embd-input-test)
-add_executable(${TARGET} embd-input-test.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/embd-input/README.md
+++ b/examples/embd-input/README.md
@ -1,63 +0,0 @@
-### Examples for input embedding directly
-
-## Requirement
-build  `libembdinput.so`
-run the following comman in main dir (../../).
-```
-make
-```
-
-## [LLaVA](https://github.com/haotian-liu/LLaVA/) example  (llava.py)
-
-1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
-2. Convert it to ggml format.
-3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
-
-```
-import torch
-
-bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
-pth_path = "./examples/embd-input/llava_projection.pth"
-
-dic = torch.load(bin_path)
-used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
-torch.save({k: dic[k] for k in used_key}, pth_path)
-```
-4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
-
-
-## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
-
-1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
-The `adapter_config.json` is
-```
-{
-  "peft_type": "LORA",
-  "fan_in_fan_out": false,
-  "bias": null,
-  "modules_to_save": null,
-  "r": 32,
-  "lora_alpha": 32,
-  "lora_dropout": 0.1,
-  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
-}
-```
-2. Papare the `vicuna` v0 model.
-3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
-4. Clone the PandaGPT source.
-```
-git clone https://github.com/yxuansu/PandaGPT
-```
-5. Install the requirement of PandaGPT.
-6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
-
-## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
-
-1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
-2. Clone the MiniGPT-4 source.
-```
-git clone https://github.com/Vision-CAIR/MiniGPT-4/
-```
-3. Install the requirement of PandaGPT.
-4. Papare the `vicuna` v0 model.
-5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -1,220 +0,0 @@
-#include "build-info.h"
-#include "common.h"
-#include "embd-input.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-static llama_context ** g_ctx;
-
-extern "C" {
-
-struct MyModel* create_mymodel(int argc, char ** argv) {
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        return nullptr;
-    }
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = uint32_t(time(NULL));
-    }
-    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
-
-    llama_backend_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    g_ctx = &ctx;
-
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return nullptr;
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
-    }
-    struct MyModel * ret = new MyModel();
-    ret->ctx = ctx;
-    ret->params = params;
-    ret->n_past = 0;
-    // printf("ctx: %d\n", ret->ctx);
-    return ret;
-}
-
-void free_mymodel(struct MyModel * mymodel) {
-    llama_context * ctx = mymodel->ctx;
-    llama_print_timings(ctx);
-    llama_free(ctx);
-    delete mymodel;
-}
-
-
-bool eval_float(void * model, float * input, int N){
-    MyModel * mymodel = (MyModel*)model;
-    llama_context * ctx = mymodel->ctx;
-    gpt_params params = mymodel->params;
-    int n_emb = llama_n_embd(llama_get_model(ctx));
-    int n_past = mymodel->n_past;
-    int n_batch = N; // params.n_batch;
-
-    for (int i = 0; i < (int) N; i += n_batch) {
-        int n_eval = (int) N - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        llama_batch batch = {  int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
-        if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        n_past += n_eval;
-    }
-    mymodel->n_past = n_past;
-    return true;
-}
-
-bool eval_tokens(void * model, std::vector<llama_token> tokens) {
-    MyModel * mymodel = (MyModel* )model;
-    llama_context * ctx;
-    ctx = mymodel->ctx;
-    gpt_params params = mymodel->params;
-    int n_past = mymodel->n_past;
-    for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > params.n_batch) {
-            n_eval = params.n_batch;
-        }
-        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        n_past += n_eval;
-    }
-    mymodel->n_past = n_past;
-    return true;
-}
-
-bool eval_id(struct MyModel* mymodel, int id) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(mymodel, tokens);
-}
-
-bool eval_string(struct MyModel * mymodel,const char* str){
-    llama_context * ctx = mymodel->ctx;
-    std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
-    eval_tokens(mymodel, embd_inp);
-    return true;
-}
-
-llama_token sampling_id(struct MyModel* mymodel) {
-    llama_context* ctx = mymodel->ctx;
-    gpt_params params = mymodel->params;
-    // int n_ctx = llama_n_ctx(ctx);
-
-    // out of user input, sample next token
-    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
-    const float   top_p           = params.top_p;
-    const float   tfs_z           = params.tfs_z;
-    const float   typical_p       = params.typical_p;
-    // const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-    // const float   repeat_penalty  = params.repeat_penalty;
-    // const float   alpha_presence  = params.presence_penalty;
-    // const float   alpha_frequency = params.frequency_penalty;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
-    // const bool    penalize_nl     = params.penalize_nl;
-
-    llama_token id = 0;
-    {
-        auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-        // Apply params.logit_bias map
-        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-            logits[it->first] += it->second;
-        }
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl(ctx)];
-        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-        // llama_sample_repetition_penalty(ctx, &candidates_p,
-        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //      last_n_repeat, repeat_penalty);
-        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        // last_n_repeat, alpha_frequency, alpha_presence);
-        // if (!penalize_nl) {
-        //     logits[llama_token_nl(ctx)] = nl_logit;
-        // }
-
-        if (temp <= 0) {
-            // Greedy sampling
-            id = llama_sample_token_greedy(ctx, &candidates_p);
-        } else {
-            if (mirostat == 1) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                const int mirostat_m = 100;
-                llama_sample_temp(ctx, &candidates_p, temp);
-                id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-            } else if (mirostat == 2) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                llama_sample_temp(ctx, &candidates_p, temp);
-                id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-            } else {
-                // Temperature sampling
-                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
-                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-                llama_sample_temp(ctx, &candidates_p, temp);
-                id = llama_sample_token(ctx, &candidates_p);
-            }
-        }
-    }
-
-    return id;
-}
-
-const char * sampling(struct MyModel * mymodel) {
-    llama_context * ctx = mymodel->ctx;
-    int id = sampling_id(mymodel);
-    static std::string ret;
-    if (id == llama_token_eos(ctx)) {
-        ret = "</s>";
-    } else {
-        ret = llama_token_to_piece(ctx, id);
-    }
-    eval_id(mymodel, id);
-    return ret.c_str();
-}
-
-}
--- a/examples/embd-input/embd-input-test.cpp
+++ b/examples/embd-input/embd-input-test.cpp
@ -1,35 +0,0 @@
-#include "embd-input.h"
-#include <stdlib.h>
-#include <random>
-#include <string.h>
-
-int main(int argc, char** argv) {
-
-    auto mymodel = create_mymodel(argc, argv);
-    int N = 10;
-    int max_tgt_len = 500;
-    int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
-
-    // add random float embd to test evaluation
-    float * data = new float[N*n_embd];
-    std::default_random_engine e;
-    std::uniform_real_distribution<float>  u(0,1);
-    for (int i=0;i<N*n_embd;i++) {
-        data[i] = u(e);
-    }
-
-    eval_string(mymodel, "user: what is the color of the flag of UN?");
-    eval_float(mymodel, data, N);
-    eval_string(mymodel, "assistant:");
-    eval_string(mymodel, mymodel->params.prompt.c_str());
-    const char* tmp;
-    for (int i=0; i<max_tgt_len; i++) {
-        tmp = sampling(mymodel);
-        if (strcmp(tmp, "</s>")==0) break;
-        printf("%s", tmp);
-        fflush(stdout);
-    }
-    printf("\n");
-    free_mymodel(mymodel);
-    return 0;
-}
--- a/examples/embd-input/embd-input.h
+++ b/examples/embd-input/embd-input.h
@ -1,27 +0,0 @@
-#ifndef _EMBD_INPUT_H_
-#define _EMBD_INPUT_H_ 1
-
-#include "common.h"
-#include "llama.h"
-
-extern "C" {
-
-typedef struct MyModel {
-    llama_context* ctx;
-    gpt_params params;
-    int n_past = 0;
-} MyModel;
-
-struct MyModel* create_mymodel(int argc, char ** argv);
-
-bool eval_float(void* model, float* input, int N);
-bool eval_tokens(void* model, std::vector<llama_token> tokens);
-bool eval_id(struct MyModel* mymodel, int id);
-bool eval_string(struct MyModel* mymodel, const char* str);
-const char * sampling(struct MyModel* mymodel);
-llama_token sampling_id(struct MyModel* mymodel);
-void free_mymodel(struct MyModel* mymodel);
-
-}
-
-#endif
--- a/examples/embd-input/embd_input.py
+++ b/examples/embd-input/embd_input.py
@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-import ctypes
-from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
-import numpy as np
-import os
-
-libc = cdll.LoadLibrary("./libembdinput.so")
-libc.sampling.restype=c_char_p
-libc.create_mymodel.restype=c_void_p
-libc.eval_string.argtypes=[c_void_p, c_char_p]
-libc.sampling.argtypes=[c_void_p]
-libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
-
-
-class MyModel:
-    def __init__(self, args):
-        argc = len(args)
-        c_str = [c_char_p(i.encode()) for i in args]
-        args_c = (c_char_p * argc)(*c_str)
-        self.model = c_void_p(libc.create_mymodel(argc, args_c))
-        self.max_tgt_len = 512
-        self.print_string_eval = True
-
-    def __del__(self):
-        libc.free_mymodel(self.model)
-
-    def eval_float(self, x):
-        libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
-
-    def eval_string(self, x):
-        libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
-        if self.print_string_eval:
-            print(x)
-
-    def eval_token(self, x):
-        libc.eval_id(self.model, x)
-
-    def sampling(self):
-        s = libc.sampling(self.model)
-        return s
-
-    def stream_generate(self, end="</s>"):
-        ret = b""
-        end = end.encode()
-        for _ in range(self.max_tgt_len):
-            tmp = self.sampling()
-            ret += tmp
-            yield tmp
-            if ret.endswith(end):
-                break
-
-    def generate_with_print(self, end="</s>"):
-        ret = b""
-        for i in self.stream_generate(end=end):
-            ret += i
-            print(i.decode(errors="replace"), end="", flush=True)
-        print("")
-        return ret.decode(errors="replace")
-
-
-    def generate(self, end="</s>"):
-        text = b"".join(self.stream_generate(end=end))
-        return text.decode(errors="replace")
-
-if __name__ == "__main__":
-    model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
-    model.eval_string("""user: what is the color of the flag of UN?""")
-    x = np.random.random((5120,10))# , dtype=np.float32)
-    model.eval_float(x)
-    model.eval_string("""assistant:""")
-    for i in model.generate():
-        print(i.decode(errors="replace"), end="", flush=True)
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-from transformers import CLIPVisionModel,  CLIPImageProcessor
-from PIL import Image
-
-# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
-vision_tower = "openai/clip-vit-large-patch14"
-select_hidden_state_layer = -2
-# (vision_config.image_size // vision_config.patch_size) ** 2
-image_token_len = (224//14)**2
-
-class Llava:
-    def __init__(self, args):
-        self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
-        self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
-        self.mm_projector = nn.Linear(1024, 5120)
-        self.model = MyModel(["main", *args])
-
-    def load_projection(self, path):
-        state = torch.load(path)
-        self.mm_projector.load_state_dict({
-            "weight": state["model.mm_projector.weight"],
-            "bias": state["model.mm_projector.bias"]})
-
-    def chat(self, question):
-        self.model.eval_string("user: ")
-        self.model.eval_string(question)
-        self.model.eval_string("\nassistant: ")
-        return self.model.generate_with_print()
-
-    def chat_with_image(self, image, question):
-        with torch.no_grad():
-            embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
-            image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
-            select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
-            image_feature = select_hidden_state[:, 1:]
-            embd_image = self.mm_projector(image_feature)
-            embd_image = embd_image.cpu().numpy()[0]
-        self.model.eval_string("user: ")
-        self.model.eval_token(32003-2) # im_start
-        self.model.eval_float(embd_image.T)
-        for i in range(image_token_len-embd_image.shape[0]):
-            self.model.eval_token(32003-3) # im_patch
-        self.model.eval_token(32003-1) # im_end
-        self.model.eval_string(question)
-        self.model.eval_string("\nassistant: ")
-        return self.model.generate_with_print()
-
-
-if __name__=="__main__":
-    # model form liuhaotian/LLaVA-13b-delta-v1-1
-    a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
-    # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
-    # Also here can use pytorch_model-00003-of-00003.bin directly.
-    a.load_projection(os.path.join(
-        os.path.dirname(__file__) ,
-        "llava_projection.pth"))
-    respose = a.chat_with_image(
-        Image.open("./media/llama1-logo.png").convert('RGB'),
-        "what is the text in the picture?")
-    respose
-    a.chat("what is the color of it?")
-
-
-
--- a/examples/embd-input/minigpt4.py
+++ b/examples/embd-input/minigpt4.py
@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-from PIL import Image
-
-minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
-sys.path.insert(0, minigpt4_path)
-from minigpt4.models.blip2 import Blip2Base
-from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
-
-
-class MiniGPT4(Blip2Base):
-    """
-    MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
-    """
-    def __init__(self,
-        args,
-        vit_model="eva_clip_g",
-        q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
-        img_size=224,
-        drop_path_rate=0,
-        use_grad_checkpoint=False,
-        vit_precision="fp32",
-        freeze_vit=True,
-        freeze_qformer=True,
-        num_query_token=32,
-        llama_model="",
-        prompt_path="",
-        prompt_template="",
-        max_txt_len=32,
-        end_sym='\n',
-        low_resource=False,  # use 8 bit and put vit in cpu
-        device_8bit=0
-    ):
-        super().__init__()
-        self.img_size = img_size
-        self.low_resource = low_resource
-        self.preprocessor = Blip2ImageEvalProcessor(img_size)
-
-        print('Loading VIT')
-        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
-            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
-        )
-        print('Loading VIT Done')
-        print('Loading Q-Former')
-        self.Qformer, self.query_tokens = self.init_Qformer(
-            num_query_token, self.visual_encoder.num_features
-        )
-        self.Qformer.cls = None
-        self.Qformer.bert.embeddings.word_embeddings = None
-        self.Qformer.bert.embeddings.position_embeddings = None
-        for layer in self.Qformer.bert.encoder.layer:
-            layer.output = None
-            layer.intermediate = None
-        self.load_from_pretrained(url_or_filename=q_former_model)
-        print('Loading Q-Former Done')
-        self.llama_proj = nn.Linear(
-            self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
-        )
-        self.max_txt_len = max_txt_len
-        self.end_sym = end_sym
-        self.model = MyModel(["main", *args])
-        # system prompt
-        self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
-           "You will be able to see the image once I provide it to you. Please answer my questions."
-           "###")
-
-    def encode_img(self, image):
-        image = self.preprocessor(image)
-        image = image.unsqueeze(0)
-        device = image.device
-        if self.low_resource:
-            self.vit_to_cpu()
-            image = image.to("cpu")
-
-        with self.maybe_autocast():
-            image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
-            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
-
-            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
-            query_output = self.Qformer.bert(
-                query_embeds=query_tokens,
-                encoder_hidden_states=image_embeds,
-                encoder_attention_mask=image_atts,
-                return_dict=True,
-            )
-
-            inputs_llama = self.llama_proj(query_output.last_hidden_state)
-            # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
-        return inputs_llama
-
-    def load_projection(self, path):
-        state = torch.load(path)["model"]
-        self.llama_proj.load_state_dict({
-            "weight": state["llama_proj.weight"],
-            "bias": state["llama_proj.bias"]})
-
-    def chat(self, question):
-        self.model.eval_string("Human: ")
-        self.model.eval_string(question)
-        self.model.eval_string("\n### Assistant:")
-        return self.model.generate_with_print(end="###")
-
-    def chat_with_image(self, image, question):
-        with torch.no_grad():
-            embd_image = self.encode_img(image)
-        embd_image = embd_image.cpu().numpy()[0]
-        self.model.eval_string("Human: <Img>")
-        self.model.eval_float(embd_image.T)
-        self.model.eval_string("</Img> ")
-        self.model.eval_string(question)
-        self.model.eval_string("\n### Assistant:")
-        return self.model.generate_with_print(end="###")
-
-
-if __name__=="__main__":
-    a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
-    a.load_projection(os.path.join(
-        os.path.dirname(__file__) ,
-        "pretrained_minigpt4.pth"))
-    respose = a.chat_with_image(
-        Image.open("./media/llama1-logo.png").convert('RGB'),
-        "what is the text in the picture?")
-    a.chat("what is the color of it?")
--- a/examples/embd-input/panda_gpt.py
+++ b/examples/embd-input/panda_gpt.py
@ -1,99 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from embd_input import MyModel
-import numpy as np
-from torch import nn
-import torch
-
-# use PandaGPT path
-panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
-imagebind_ckpt_path = "./models/panda_gpt/"
-
-sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
-from ImageBind.models import imagebind_model
-from ImageBind import data
-
-ModalityType = imagebind_model.ModalityType
-max_tgt_len = 400
-
-class PandaGPT:
-    def __init__(self, args):
-        self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
-        self.visual_encoder.eval()
-        self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
-        self.max_tgt_len = max_tgt_len
-        self.model = MyModel(["main", *args])
-        self.generated_text = ""
-        self.device = "cpu"
-
-    def load_projection(self, path):
-        state = torch.load(path, map_location="cpu")
-        self.llama_proj.load_state_dict({
-            "weight": state["llama_proj.weight"],
-            "bias": state["llama_proj.bias"]})
-
-    def eval_inputs(self, inputs):
-        self.model.eval_string("<Img>")
-        embds = self.extract_multimoal_feature(inputs)
-        for i in embds:
-            self.model.eval_float(i.T)
-        self.model.eval_string("</Img> ")
-
-    def chat(self, question):
-        return self.chat_with_image(None, question)
-
-    def chat_with_image(self, inputs, question):
-        if self.generated_text == "":
-            self.model.eval_string("###")
-        self.model.eval_string(" Human: ")
-        if inputs:
-            self.eval_inputs(inputs)
-        self.model.eval_string(question)
-        self.model.eval_string("\n### Assistant:")
-        ret = self.model.generate_with_print(end="###")
-        self.generated_text += ret
-        return ret
-
-    def extract_multimoal_feature(self, inputs):
-        features = []
-        for key in ["image", "audio", "video", "thermal"]:
-            if key + "_paths" in inputs:
-                embeds = self.encode_data(key, inputs[key+"_paths"])
-                features.append(embeds)
-        return features
-
-    def encode_data(self, data_type, data_paths):
-
-        type_map = {
-            "image": ModalityType.VISION,
-            "audio": ModalityType.AUDIO,
-            "video": ModalityType.VISION,
-            "thermal": ModalityType.THERMAL,
-        }
-        load_map = {
-            "image": data.load_and_transform_vision_data,
-            "audio": data.load_and_transform_audio_data,
-            "video": data.load_and_transform_video_data,
-            "thermal": data.load_and_transform_thermal_data
-        }
-
-        load_function = load_map[data_type]
-        key = type_map[data_type]
-
-        inputs = {key: load_function(data_paths, self.device)}
-        with torch.no_grad():
-            embeddings = self.visual_encoder(inputs)
-            embeds = embeddings[key]
-            embeds = self.llama_proj(embeds).cpu().numpy()
-        return embeds
-
-
-if __name__=="__main__":
-    a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
-    a.load_projection("./models/panda_gpt/adapter_model.bin")
-    a.chat_with_image(
-        {"image_paths": ["./media/llama1-logo.png"]},
-        "what is the text in the picture? 'llama' or 'lambda'?")
-    a.chat("what is the color of it?")
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
    set_param_lora(lora);

    // measure data size
-    struct ggml_allocr * alloc = NULL;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    alloc_lora(alloc, lora);
+    size_t size = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
+    }

    // allocate data
-    lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
-    ggml_allocr_free(alloc);
+    struct ggml_allocr * alloc = NULL;
+    lora->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
    alloc_lora(alloc, lora);
    ggml_allocr_free(alloc);
@ -1714,11 +1715,9 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);

    // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
+                            tensor_alignment;
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

    // allocate input tensors
--- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -39,8 +39,8 @@ static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
-static bool is_interacting = false;

+static bool is_interacting = false;

 static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
@ -104,6 +104,7 @@ static void sigint_handler(int signo) {

 int main(int argc, char ** argv) {
    gpt_params params;
+    llama_sampling_params & sparams = params.sparams;
    g_params = &params;

    if (!gpt_params_parse(argc, argv, params)) {
@ -206,7 +207,7 @@ int main(int argc, char ** argv) {
    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
+    if (sparams.cfg_scale > 1.f) {
        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
        ctx_guidance = llama_new_context_with_model(model, lparams);
    }
@ -233,23 +234,35 @@ int main(int argc, char ** argv) {
    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    LOG("add_bos: %d\n", add_bos);

+    bool suff_rm_leading_spc = params.escape;
+    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+        params.input_suffix.erase(0, 1);
+        suff_rm_leading_spc = false;
+    }
    std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
-    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    const int space_token = 29871;
+    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
+        inp_sfx.erase(inp_sfx.begin());
+    }
+    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+    if (add_bos) {
+        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
+    }
+    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-    embd_inp.push_back(llama_token_middle(ctx));
+    embd_inp.push_back(llama_token_middle(model));

    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());

    // Should not run without any tokens
    if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(ctx));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+        embd_inp.push_back(llama_token_bos(model));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    // Tokenize negative prompt
@ -257,13 +270,13 @@ int main(int argc, char ** argv) {
    int guidance_offset = 0;
    int original_prompt_len = 0;
    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@ -281,8 +294,8 @@ int main(int argc, char ** argv) {
        params.n_keep = (int)embd_inp.size();
    }

-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());


    // enable interactive mode if interactive start is specified
@ -300,7 +313,7 @@ int main(int argc, char ** argv) {

        if (ctx_guidance) {
            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@ -345,39 +358,10 @@ int main(int argc, char ** argv) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

-    struct llama_grammar * grammar = NULL;
-    grammar_parser::parse_state parsed_grammar;
-
-    if (!params.grammar.empty()) {
-        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-        // will be empty (default) if there are parse errors
-        if (parsed_grammar.rules.empty()) {
-            return 1;
-        }
-        LOG_TEE("%s: grammar:\n", __func__);
-        grammar_parser::print_grammar(stderr, parsed_grammar);
-        LOG_TEE("\n");
-
-        {
-            auto it = params.logit_bias.find(llama_token_eos(ctx));
-            if (it != params.logit_bias.end() && it->second == -INFINITY) {
-                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
-            }
-        }
-
-        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-        grammar = llama_grammar_init(
-            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-    }
-
-    // TODO: replace with ring-buffer
-    std::vector<llama_token> last_tokens(n_ctx);
-    std::fill(last_tokens.begin(), last_tokens.end(), 0);
    LOG_TEE("\n#####  Infill mode  #####\n\n");
    if (params.infill) {
        printf("\n************\n");
@ -420,10 +404,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

-    const int n_vocab = llama_n_vocab(model);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

    while (n_remain != 0 || params.interactive) {
        // predict
@ -470,7 +451,7 @@ int main(int argc, char ** argv) {

                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

-                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

            }

@ -498,7 +479,7 @@ int main(int argc, char ** argv) {
                    input_buf  = embd_guidance.data();
                    input_size = embd_guidance.size();

-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
+                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
                } else {
                    input_buf  = embd.data();
                    input_size = embd.size();
@ -521,7 +502,7 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }

-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
@ -540,12 +521,11 @@ int main(int argc, char ** argv) {

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {

-            const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

-            last_tokens.erase(last_tokens.begin());
-            last_tokens.push_back(id);
+            llama_sampling_accept(ctx_sampling, ctx, id, true);

-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
+            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

            embd.push_back(id);

@ -561,8 +541,11 @@ int main(int argc, char ** argv) {
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
-                last_tokens.erase(last_tokens.begin());
-                last_tokens.push_back(embd_inp[n_consumed]);
+
+                // push the prompt in the sampling context in order to apply repetition penalties later
+                // for the prompt, we don't apply grammar rules
+                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
@ -594,10 +577,10 @@ int main(int argc, char ** argv) {
        if ((int) embd_inp.size() <= n_consumed) {

            // deal with eot token in infill mode
-            if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
+            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
                if(is_interacting && !params.interactive_first) {
                    // print an eot token
-                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
+                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
                fflush(stdout);
                printf("\n");
@ -611,7 +594,7 @@ int main(int argc, char ** argv) {
                    buffer += line;
                } while (another_line);
                // check if we got an empty line, if so we use the old input
-                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_prefix = buffer;
                }
                buffer.clear();
@ -621,20 +604,37 @@ int main(int argc, char ** argv) {
                    buffer += line;
                } while (another_line);
                // check if we got an empty line
-                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
+                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_suffix = buffer;
                }
                buffer.clear();
                // done taking input, reset color
                console::set_display(console::reset);
+
+                if (params.escape) {
+                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
+                    process_escapes(params.input_prefix);
+                    process_escapes(params.input_suffix);
+                }
+                suff_rm_leading_spc = params.escape;
+                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
+                    params.input_suffix.erase(0, 1);
+                    suff_rm_leading_spc = false;
+                }
                // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
-                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
+                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
+                    inp_sfx.erase(inp_sfx.begin());
+                }
+                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+                if (add_bos) {
+                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
+                }
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(ctx));
+                embd_inp.push_back(llama_token_middle(model));
                embd.clear();
                embd_guidance.clear();
                n_remain = params.n_predict;
@ -644,7 +644,7 @@ int main(int argc, char ** argv) {
                is_interacting = false;
            }
            // deal with end of text token in interactive mode
-            else if (last_tokens.back() == llama_token_eos(ctx)) {
+            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
                LOG("found EOS token\n");

                if (params.interactive) {
@ -661,7 +661,7 @@ int main(int argc, char ** argv) {

                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(ctx));
+                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
@ -696,7 +696,7 @@ int main(int argc, char ** argv) {
                    const size_t original_size = embd_inp.size();

                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

@ -717,22 +717,14 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    // reset grammar state if we're restarting generation
-                    if (grammar != NULL) {
-                        llama_grammar_free(grammar);
-
-                        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-                        grammar = llama_grammar_init(
-                            grammar_rules.data(), grammar_rules.size(),
-                            parsed_grammar.symbol_ids.at("root"));
-                    }
+                    llama_sampling_reset(ctx_sampling);
                }
                is_interacting = false;
            }
        }

        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
            break;
        }

@ -744,7 +736,7 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
+        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
        fflush(stdout);
    }

@ -755,9 +747,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

-    if (grammar != NULL) {
-        llama_grammar_free(grammar);
-    }
+    llama_sampling_free(ctx_sampling);
    llama_backend_free();

 #ifndef LOG_DISABLE_LOGS
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -933,7 +933,7 @@ struct sql_printer : public printer {
 };

 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
-    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
+    std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
    int n_processed = 0;

    llama_set_n_threads(ctx, n_threads, n_threads);
@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 }

 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos(ctx);
+    llama_token token = llama_token_bos(llama_get_model(ctx));

    llama_set_n_threads(ctx, n_threads, n_threads);

--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@ -0,0 +1,20 @@
+set(TARGET clip)
+add_library(${TARGET} clip.cpp clip.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+# -Wno-cast-qual silences warnings coming from stb_image.h; the flag is only passed to non-MSVC compilers
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
+
+set(TARGET llava)
+add_executable(${TARGET} llava.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -0,0 +1,57 @@
+# LLaVA
+
+Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
+
+The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
+and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
+models are available.
+
+After API is confirmed, more models will be supported / uploaded.
+
+## Usage
+Build with cmake or run `make llava` to build it.
+
+After building, run: `./llava` to see the usage. For example:
+
+```sh
+./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+```
+
+**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
+
+## Model conversion
+
+1. Clone `llava-v1.5-7b` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
+```
+
+3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py ../llava-v1.5-7b
+```
+
+Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
+
+## TODO
+
+- [ ] Support server mode.
+- [ ] Support non-CPU backend for the image encoding part.
+- [ ] Support different sampling methods.
+- [ ] Support more model variants.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@ -0,0 +1,73 @@
+#ifndef CLIP_H
+#define CLIP_H
+
+#include "ggml.h"
+
+struct clip_ctx;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct clip_vision_hparams {
+    int32_t image_size;
+    int32_t patch_size;
+    int32_t hidden_size;
+    int32_t n_intermediate;
+    int32_t projection_dim;
+    int32_t n_head;
+    int32_t n_layer;
+    float eps;
+};
+
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+
+void clip_free(struct clip_ctx * ctx);
+
+size_t clip_embd_nbytes(struct clip_ctx * ctx);
+int clip_n_patches(struct clip_ctx * ctx);
+int clip_n_mmproj_embd(struct clip_ctx * ctx);
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+    uint8_t * data;
+    size_t size;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+    float * data;
+    size_t size;
+};
+
+struct clip_image_u8_batch {
+    struct clip_image_u8 * data;
+    size_t size;
+};
+
+struct clip_image_f32_batch {
+    struct clip_image_f32 * data;
+    size_t size;
+};
+
+// Factory functions returning pointers to the image structs declared above.
+// Declared with `(void)` so that they are real prototypes when this header is
+// consumed from C (they sit inside the extern "C" block); in C++ `()` and
+// `(void)` are equivalent, so existing callers are unaffected.
+// NOTE(review): how the returned objects are released is not visible in this header — confirm in clip.cpp.
+struct clip_image_u8 * make_clip_image_u8(void);
+struct clip_image_f32 * make_clip_image_f32(void);
+bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
+bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
+
+bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
+                             float * vec);
+
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CLIP_H
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@ -0,0 +1,250 @@
+import argparse
+import os
+import json
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import CLIPModel, CLIPProcessor
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    """Fill the `{arch}` placeholder of the key template `raw_key` with `arch`."""
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+    """Return True when the tensor called `name` must not be written to the output file.
+
+    A tensor is skipped when any of the following holds:
+      * it is one of the three always-ignored names listed below;
+      * `has_llava` is set and it is the visual projection or a post-layernorm tensor;
+      * its name starts with "v" and `has_vision` is False;
+      * its name starts with "t" and `has_text` is False.
+    """
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
+        return True
+
+    # prefix checks: "v..." covers vision_model.* (and visual_projection.*), "t..." covers text_model.*
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    """Translate a source tensor name into the name written to the output file.
+
+    Names containing "projection" are returned unchanged. Names containing
+    "mm_projector" get their "model.mm_projector" prefix replaced by "mm".
+    Every other name goes through the fixed chain of substring replacements below;
+    the replacements are applied in order, so earlier ones affect what later ones see.
+    """
+    if "projection" in name:
+        return name
+
+    if "mm_projector" in name:
+        return name.replace("model.mm_projector", "mm")
+
+    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
+
+
+def bytes_to_unicode():
+    """
+    Returns a dict mapping each of the 256 byte values to a one-character unicode string.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    # byte values in these three ranges map to the character with the same code point
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    # every remaining byte value is assigned the next unused code point starting at 256
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
+ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
+ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
+ap.add_argument("--text-only", action="store_true", required=False,
+                help="Save a text-only model. It can't be used to encode images")
+ap.add_argument("--vision-only", action="store_true", required=False,
+                help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
+ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
+ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+
+args = ap.parse_args()
+
+
+if args.text_only and args.vision_only:
+    print("--text-only and --image-only arguments cannot be specified at the same time.")
+    exit(1)
+
+if args.use_f32:
+    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+
+# output in the same directory as the model if output_dir is None
+dir_model = args.model_dir
+
+
+with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+    vocab = json.load(f)
+    tokens = [key for key in vocab]
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    config = json.load(f)
+    v_hparams = config["vision_config"]
+    t_hparams = config["text_config"]
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if args.use_f32:
+    ftype = 0
+
+
+model = CLIPModel.from_pretrained(dir_model)
+processor = CLIPProcessor.from_pretrained(dir_model)
+
+fname_middle = None
+has_text_encoder = True
+has_vision_encoder = True
+has_llava_projector = False
+if args.text_only:
+    fname_middle = "text-"
+    has_vision_encoder = False
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
+elif args.llava_projector is not None:
+    fname_middle = "mmproj-"
+    has_text_encoder = False
+    has_llava_projector = True
+else:
+    fname_middle = ""
+
+output_dir = args.output_dir if args.output_dir is not None else dir_model
+os.makedirs(output_dir, exist_ok=True)
+output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
+fout = GGUFWriter(path=fname_out, arch="clip")
+
+fout.add_bool("clip.has_text_encoder", has_text_encoder)
+fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
+fout.add_bool("clip.has_llava_projector", has_llava_projector)
+fout.add_file_type(ftype)
+model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
+fout.add_name(model_name)
+if args.text_only:
+    fout.add_description("text-only CLIP model")
+elif args.vision_only and not has_llava_projector:
+    fout.add_description("vision-only CLIP model")
+elif has_llava_projector:
+    fout.add_description("image encoder for LLaVA")
+else:
+    fout.add_description("two-tower CLIP model")
+
+if has_text_encoder:
+    # text_model hparams
+    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
+    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
+    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
+    fout.add_token_list(tokens)
+
+if has_vision_encoder:
+    # vision_model hparams
+    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
+    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
+    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
+    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
+
+    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
+    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
+    fout.add_array("clip.vision.image_mean", image_mean)
+    fout.add_array("clip.vision.image_std", image_std)
+
+use_gelu = v_hparams["hidden_act"] == "gelu"
+fout.add_bool("clip.use_gelu", use_gelu)
+
+
+if has_llava_projector:
+    model.vision_model.encoder.layers.pop(-1)
+    projector = torch.load(args.llava_projector)
+    for name, data in projector.items():
+        name = get_tensor_name(name)
+        if data.ndim == 2:
+            data = data.squeeze().numpy().astype(np.float16)
+        else:
+            data = data.squeeze().numpy().astype(np.float32)
+
+        fout.add_tensor(name, data)
+
+    print("Projector tensors added\n")
+
+state_dict = model.state_dict()
+for name, data in state_dict.items():
+    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
+        # we don't need this
+        print(f"skipping parameter: {name}")
+        continue
+
+    name = get_tensor_name(name)
+    data = data.squeeze().numpy()
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if n_dims == 4:
+        print(f"tensor {name} is always saved in f16")
+        data = data.astype(np.float16)
+        ftype_cur = 1
+    elif ftype == 1:
+        if name[-7:] == ".weight" and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
+    fout.add_tensor(name, data)
+
+
+fout.write_header_to_file()
+fout.write_kv_data_to_file()
+fout.write_tensors_to_file()
+fout.close()
+
+print("Done. Output file: " + fname_out)
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@ -0,0 +1,46 @@
+import argparse
+import glob
+import os
+import torch
+
+
+ap = argparse.ArgumentParser()
+ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
+args = ap.parse_args()
+
+# find the model part that includes the the multimodal projector weights
+path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
+checkpoint = torch.load(path)
+
+# get a list of mm tensor names
+mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
+
+# store these tensors in a new dictionary and torch.save them
+projector = {name: checkpoint[name].float() for name in mm_tensors}
+torch.save(projector, f"{args.model}/llava.projector")
+
+# remove these tensors from the checkpoint and save it again
+for name in mm_tensors:
+    del checkpoint[name]
+
+# BakLLaVA models contain CLIP tensors in it
+clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
+if len(clip_tensors) > 0:
+    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
+    torch.save(clip, f"{args.model}/llava.clip")
+
+    # remove these tensors
+    for name in clip_tensors:
+        del checkpoint[name]
+
+    # added tokens should be removed to be able to convert Mistral models
+    if os.path.exists(f"{args.model}/added_tokens.json"):
+        with open(f"{args.model}/added_tokens.json", "w") as f:
+            f.write("{}\n")
+
+
+torch.save(checkpoint, path)
+
+print("Done!")
+print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@ -0,0 +1,147 @@
+#pragma once
+
+// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
+
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+// Feed N embedding vectors to llama_decode in chunks of at most n_batch.
+// `embd` is read as consecutive rows of n_embd floats (n_embd is taken from the
+// context's model). After each successful chunk *n_past is advanced by the
+// chunk size. Returns false as soon as llama_decode reports a failure, true otherwise.
+inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
+    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
+
+    for (int i = 0; i < N; i += n_batch) {
+        // size of this chunk: whatever is left, capped at n_batch
+        int n_eval = N - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        // positional initialisation: n_eval entries, no token ids (nullptr), embeddings starting at row i
+        // NOTE(review): the remaining fields rely on the llama_batch member order in llama.h — verify when that struct changes
+        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        if (llama_decode(ctx_llama, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+// Decode `tokens` in chunks of at most n_batch, advancing *n_past by the chunk
+// size after each successful llama_decode call.
+// `tokens` is taken by value, so the caller's vector is never touched; the local
+// copy provides the non-const pointer passed to llama_batch_get_one.
+// Returns false on the first failed decode, true otherwise.
+inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        // size of this chunk: whatever is left, capped at n_batch
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+// Evaluate a single token id: wraps it in a one-element vector and forwards to
+// eval_tokens with a batch size of 1. Returns eval_tokens' result.
+inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+// Tokenize `str` (add_bos is forwarded to llama_tokenize) and evaluate the
+// resulting tokens in chunks of n_batch, advancing *n_past.
+// Returns false if decoding any chunk fails, true otherwise.
+inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string              str2     = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
+    // propagate the decode status instead of unconditionally reporting success
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+}
+
+// TODO: use common/sampling.h
+// Pick the next token id from the logits currently held by ctx_llama, using the
+// sampling settings in params.sparams.
+//   - temp <= 0          : greedy pick
+//   - mirostat == 1 or 2 : temperature followed by the matching mirostat sampler
+//   - otherwise          : top-k, tail-free, typical, top-p, temperature, then a random draw
+// Repetition / frequency / presence penalties are not applied (see the commented-out code).
+inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
+    auto & sparams = params.sparams;
+
+    // out of user input, sample next token
+    const float   temp      = sparams.temp;
+    // a non-positive top_k is replaced by the vocabulary size
+    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
+    const float   top_p     = sparams.top_p;
+    const float   tfs_z     = sparams.tfs_z;
+    const float   typical_p = sparams.typical_p;
+    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
+    // const float   repeat_penalty  = sparams.repeat_penalty;
+    // const float   alpha_presence  = sparams.presence_penalty;
+    // const float   alpha_frequency = sparams.frequency_penalty;
+    const int     mirostat     = sparams.mirostat;
+    const float   mirostat_tau = sparams.mirostat_tau;
+    const float   mirostat_eta = sparams.mirostat_eta;
+    // const bool    penalize_nl     = sparams.penalize_nl;
+
+    llama_token id = 0;
+    {
+        auto logits  = llama_get_logits(ctx_llama);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
+
+        // Apply params.logit_bias map
+        // (the bias is added in place to the buffer returned by llama_get_logits)
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
+            logits[it->first] += it->second;
+        }
+
+        // one candidate per vocabulary entry, carrying its (biased) logit
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        // TODO: Apply penalties
+        // float nl_logit = logits[llama_token_nl(ctx)];
+        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+        // llama_sample_repetition_penalty(ctx, &candidates_p,
+        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //      last_n_repeat, repeat_penalty);
+        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        // last_n_repeat, alpha_frequency, alpha_presence);
+        // if (!penalize_nl) {
+        //     logits[llama_token_nl(ctx)] = nl_logit;
+        // }
+
+        if (temp <= 0) {
+              // Greedy sampling
+            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+        } else {
+            if (mirostat == 1) {
+                // function-local static: initialised the first time this branch runs and
+                // then shared by every later call, regardless of context or tau
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                const  int mirostat_m    = 100;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+            } else if (mirostat == 2) {
+                // separate static from the one in the mirostat == 1 branch, same lifetime rules
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+            } else {
+                  // Temperature sampling
+                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token(ctx_llama, &candidates_p);
+            }
+        }
+    }
+
+    return id;
+}
+
+// Sample one token, evaluate it (advancing *n_past through eval_id) and return
+// its text. The EOS token is reported as the literal "</s>".
+// The returned pointer refers to a function-local static string, so it is only
+// valid until the next call to sample(). The result of eval_id is not checked.
+inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
+    int id = sample_id(ctx_llama, params);
+    // static so that the c_str() returned below outlives this call
+    static std::string ret;
+    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -0,0 +1,164 @@
+#include "clip.h"
+#include "llava-utils.h"
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+// Prints a llava-specific usage example and a temperature hint to stdout.
+// argc is intentionally unused; argv[0] is shown as the program name.
+static void show_additional_info(int /*argc*/, char ** argv) {
+    const char * prog_name = argv[0];
+
+    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", prog_name);
+    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+// Entry point of the llava example.
+//
+// Steps:
+//   1. parse the command line (requires --mmproj and --image)
+//   2. load the CLIP model, load + preprocess the image and encode it into embeddings
+//   3. free CLIP, then load the LLaMA model and create a context
+//   4. evaluate "<system prompt>USER: <image embeddings>\n<prompt>\nASSISTANT:"
+//   5. sample up to max_tgt_len tokens and print them as they are produced
+//
+// Returns 0 on success and 1 on any failure. Every failure path releases the
+// resources that were acquired up to that point.
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    // both the multimodal projector and an image are mandatory for this example
+    if (params.mmproj.empty() || params.image.empty()) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    const char * clip_path = params.mmproj.c_str();
+    const char * img_path = params.image.c_str();
+
+    if (params.prompt.empty()) {
+        params.prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
+    // do not hand a null context to the clip_* calls below
+    if (!ctx_clip) {
+        fprintf(stderr, "%s: unable to load the CLIP model from %s\n", __func__, clip_path);
+
+        return 1;
+    }
+
+    // load and preprocess the image
+    clip_image_u8 img;
+    clip_image_f32 img_res;
+
+    if (!clip_image_load_from_file(img_path, &img)) {
+        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
+
+        clip_free(ctx_clip);
+        return 1;
+    }
+
+    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
+        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
+
+        clip_free(ctx_clip);
+        return 1;
+    }
+
+    int n_img_pos  = clip_n_patches(ctx_clip);
+    int n_img_embd = clip_n_mmproj_embd(ctx_clip);
+
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+
+    if (!image_embd) {
+        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
+
+        clip_free(ctx_clip);
+        return 1;
+    }
+
+    const int64_t t_img_enc_start_us = ggml_time_us();
+    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
+        fprintf(stderr, "Unable to encode image\n");
+
+        free(image_embd);
+        clip_free(ctx_clip);
+        return 1;
+    }
+    const int64_t t_img_enc_end_us = ggml_time_us();
+
+    // we get the embeddings, free up the memory required for CLIP
+    clip_free(ctx_clip);
+
+    llama_backend_init(params.numa);
+
+    llama_model_params model_params              = llama_model_default_params();
+                       model_params.n_gpu_layers = params.n_gpu_layers;
+                       model_params.main_gpu     = params.main_gpu;
+                       model_params.tensor_split = params.tensor_split;
+                       model_params.use_mmap     = params.use_mmap;
+                       model_params.use_mlock    = params.use_mlock;
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+
+        llama_backend_free();
+        free(image_embd);
+        return 1;
+    }
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    ctx_params.n_ctx           = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
+    ctx_params.n_threads       = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.seed            = params.seed;
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+
+        llama_free_model(model);
+        llama_backend_free();
+        free(image_embd);
+        return 1;
+    }
+
+    // make sure that the correct mmproj was used, i.e., compare apples to apples
+    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+
+    if (n_img_embd != n_llama_embd) {
+        // this is an error, so report it on stderr like the other failure paths
+        fprintf(stderr, "%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
+
+        llama_free(ctx_llama);
+        llama_free_model(model);
+        llama_backend_free();
+        free(image_embd);
+
+        return 1;
+    }
+
+    // process the prompt
+    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
+
+    int n_past = 0;
+
+    // a negative n_predict falls back to a 256-token response limit
+    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+
+    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
+    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
+    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
+
+    // generate the response
+
+    printf("\n");
+    printf("prompt: '%s'\n", params.prompt.c_str());
+    printf("\n");
+
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_llama, params, &n_past);
+        // sample() returns "</s>" for the end-of-sequence token
+        if (strcmp(tmp, "</s>") == 0) break;
+
+        printf("%s", tmp);
+        fflush(stdout);
+    }
+
+    printf("\n");
+
+    {
+        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+
+        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
+    }
+
+    llama_print_timings(ctx_llama);
+
+    llama_free(ctx_llama);
+    llama_free_model(model);
+    llama_backend_free();
+    free(image_embd);
+
+    return 0;
+}
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -3,7 +3,6 @@
 #include "console.h"
 #include "llama.h"
 #include "build-info.h"
-#include "grammar-parser.h"

 #include <cassert>
 #include <cinttypes>
@ -113,6 +112,7 @@ int main(int argc, char ** argv) {
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
+    llama_sampling_params & sparams = params.sparams;

 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("main", "log"));
@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
+    if (sparams.cfg_scale > 1.f) {
        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
        ctx_guidance = llama_new_context_with_model(model, lparams);
    }
@ -245,19 +245,19 @@ int main(int argc, char ** argv) {

    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
        LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
    } else {
        LOG("use session tokens\n");
        embd_inp = session_tokens;
    }

    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());

    // Should not run without any tokens
    if (embd_inp.empty()) {
-        embd_inp.push_back(llama_token_bos(ctx));
-        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+        embd_inp.push_back(llama_token_bos(model));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    // Tokenize negative prompt
@ -265,13 +265,13 @@ int main(int argc, char ** argv) {
    int guidance_offset = 0;
    int original_prompt_len = 0;
    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@ -304,6 +304,9 @@ int main(int argc, char ** argv) {
            LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
                __func__, n_matching_session_tokens, embd_inp.size());
        }
+
+        // remove any "future" tokens that we might have inherited from the previous session
+        llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
    }

    LOGLN(
@ -324,11 +327,11 @@ int main(int argc, char ** argv) {
    }

    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false,   true);

-    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
-    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
@ -351,7 +354,7 @@ int main(int argc, char ** argv) {

        if (ctx_guidance) {
            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@ -387,6 +390,12 @@ int main(int argc, char ** argv) {
        if (!params.antiprompt.empty()) {
            for (const auto & antiprompt : params.antiprompt) {
                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+                if (params.verbose_prompt) {
+                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    for (int i = 0; i < (int) tmp.size(); i++) {
+                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    }
+                }
            }
        }

@ -396,45 +405,27 @@ int main(int argc, char ** argv) {

        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
+            }
        }

        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
-        }
-    }
-    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    LOG_TEE("\n\n");
-
-    struct llama_grammar * grammar = NULL;
-    grammar_parser::parse_state parsed_grammar;
-
-    if (!params.grammar.empty()) {
-        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-        // will be empty (default) if there are parse errors
-        if (parsed_grammar.rules.empty()) {
-            return 1;
-        }
-        LOG_TEE("%s: grammar:\n", __func__);
-        grammar_parser::print_grammar(stderr, parsed_grammar);
-        LOG_TEE("\n");
-
-        {
-            auto it = params.logit_bias.find(llama_token_eos(ctx));
-            if (it != params.logit_bias.end() && it->second == -INFINITY) {
-                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
            }
        }
-
-        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-        grammar = llama_grammar_init(
-            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }
-
-    // TODO: replace with ring-buffer
-    std::vector<llama_token> last_tokens(n_ctx);
-    std::fill(last_tokens.begin(), last_tokens.end(), 0);
+    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("\n\n");

    if (params.interactive) {
        const char *control_message;
@ -475,10 +466,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

-    const int n_vocab = llama_n_vocab(model);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@ -525,7 +513,7 @@ int main(int argc, char ** argv) {

                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

-                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                LOG("clear session path\n");
                path_session.clear();
@ -551,14 +539,10 @@ int main(int argc, char ** argv) {
                if (i > 0) {
                    embd.erase(embd.begin(), embd.begin() + i);
                }
-
-                // remove any "future" tokens that we might have inherited from the session from the KV cache
-                llama_kv_cache_tokens_rm(ctx, n_past, -1);
            }

            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
-
            if (ctx_guidance) {
                int input_size = 0;
                llama_token * input_buf = NULL;
@ -580,7 +564,7 @@ int main(int argc, char ** argv) {
                    input_buf  = embd_guidance.data();
                    input_size = embd_guidance.size();

-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
+                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
                } else {
                    input_buf  = embd.data();
                    input_size = embd.size();
@ -603,7 +587,7 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }

-                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
@ -633,12 +617,11 @@ int main(int argc, char ** argv) {
                LOG("saved session to %s\n", path_session.c_str());
            }

-            const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

-            last_tokens.erase(last_tokens.begin());
-            last_tokens.push_back(id);
+            llama_sampling_accept(ctx_sampling, ctx, id, true);

-            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
+            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

            embd.push_back(id);

@ -654,8 +637,11 @@ int main(int argc, char ** argv) {
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
-                last_tokens.erase(last_tokens.begin());
-                last_tokens.push_back(embd_inp[n_consumed]);
+
+                // push the prompt in the sampling context in order to apply repetition penalties later
+                // for the prompt, we don't apply grammar rules
+                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
@ -685,12 +671,10 @@ int main(int argc, char ** argv) {

        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
-            // check for reverse prompt
+            // check for reverse prompt in the last n_prev tokens
            if (!params.antiprompt.empty()) {
-                std::string last_output;
-                for (auto id : last_tokens) {
-                    last_output += llama_token_to_piece(ctx, id);
-                }
+                const int n_prev = 32;
+                const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);

                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
@ -717,13 +701,13 @@ int main(int argc, char ** argv) {
            }

            // deal with end of text token in interactive mode
-            if (last_tokens.back() == llama_token_eos(ctx)) {
+            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
                LOG("found EOS token\n");

                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
                        // tokenize and inject first reverse prompt
-                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
                        embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                        is_antiprompt = true;
                    }
@ -744,14 +728,13 @@ int main(int argc, char ** argv) {

                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(ctx));
+                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                    buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
+                    printf("%s", params.input_prefix.c_str());
                }

                // color user input only
@ -773,7 +756,6 @@ int main(int argc, char ** argv) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                        buffer += params.input_suffix;
                        printf("%s", params.input_suffix.c_str());
                    }

@ -787,11 +769,18 @@ int main(int argc, char ** argv) {
                        n_consumed = embd_inp.size();
                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                    }
+                    if (params.escape) {
+                        process_escapes(buffer);
+                    }

-                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
+                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
+                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

+                    embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                    embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());

                    // instruct mode: insert response suffix
                    if (params.instruct) {
@ -816,22 +805,14 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    // reset grammar state if we're restarting generation
-                    if (grammar != NULL) {
-                        llama_grammar_free(grammar);
-
-                        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-                        grammar = llama_grammar_init(
-                            grammar_rules.data(), grammar_rules.size(),
-                            parsed_grammar.symbol_ids.at("root"));
-                    }
+                    llama_sampling_reset(ctx_sampling);
                }
                is_interacting = false;
            }
        }

        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
@ -856,9 +837,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

-    if (grammar != NULL) {
-        llama_grammar_free(grammar);
-    }
+    llama_sampling_free(ctx_sampling);
    llama_backend_free();

 #ifndef LOG_DISABLE_LOGS
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -51,6 +51,12 @@ static std::vector<std::string> k_prompts = {
 };

 struct client {
+    ~client() {
+        if (ctx_sampling) {
+            llama_sampling_free(ctx_sampling);
+        }
+    }
+
    int32_t id = 0;

    llama_seq_id seq_id = -1;
@ -68,7 +74,7 @@ struct client {
    std::string prompt;
    std::string response;

-    std::vector<llama_token> tokens_prev;
+    struct llama_sampling_context * ctx_sampling = nullptr;
 };

 static void print_date_time() {
@ -145,20 +151,15 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "\n\n");
    fflush(stderr);

-    const int n_ctx   = llama_n_ctx(ctx);
-    const int n_vocab = llama_n_vocab(model);
+    const int n_ctx = llama_n_ctx(ctx);

    std::vector<client> clients(n_clients);
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.tokens_prev.resize(std::max(256, params.n_predict));
-        std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
+        client.ctx_sampling = llama_sampling_init(params.sparams);
    }

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
    std::vector<llama_token> tokens_system;
    tokens_system = ::llama_tokenize(ctx, k_system, true);
    const int32_t n_tokens_system = tokens_system.size();
@ -167,7 +168,7 @@ int main(int argc, char ** argv) {

    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
-    llama_batch batch = llama_batch_init(n_ctx, 0);
+    llama_batch batch = llama_batch_init(n_ctx, 0, 1);

    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;
@ -182,13 +183,8 @@ int main(int argc, char ** argv) {
    {
        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);

-        batch.n_tokens = n_tokens_system;
-
-        for (int32_t i = 0; i < batch.n_tokens; ++i) {
-            batch.token[i]  = tokens_system[i];
-            batch.pos[i]    = i;
-            batch.seq_id[i] = 0;
-            batch.logits[i] = false;
+        for (int32_t i = 0; i < n_tokens_system; ++i) {
+            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
        }

        if (llama_decode(ctx, batch) != 0) {
@ -207,7 +203,7 @@ int main(int argc, char ** argv) {
    LOG_TEE("Processing requests ...\n\n");

    while (true) {
-        batch.n_tokens = 0;
+        llama_batch_clear(batch);

        // decode any currently ongoing sequences
        for (auto & client : clients) {
@ -215,15 +211,11 @@ int main(int argc, char ** argv) {
                continue;
            }

-            batch.token [batch.n_tokens] = client.sampled;
-            batch.pos   [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
-            batch.seq_id[batch.n_tokens] = client.id;
-            batch.logits[batch.n_tokens] = true;
-
-            client.n_decoded += 1;
            client.i_batch = batch.n_tokens;

-            batch.n_tokens += 1;
+            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
+
+            client.n_decoded += 1;
        }

        if (batch.n_tokens == 0) {
@ -248,18 +240,14 @@ int main(int argc, char ** argv) {
                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

-                    std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
+                    llama_sampling_reset(client.ctx_sampling);

                    // do not prepend BOS because we have a system prompt!
                    std::vector<llama_token> tokens_prompt;
                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);

                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        batch.token [batch.n_tokens] = tokens_prompt[i];
-                        batch.pos   [batch.n_tokens] = i + n_tokens_system;
-                        batch.seq_id[batch.n_tokens] = client.id;
-                        batch.logits[batch.n_tokens] = false;
-                        batch.n_tokens += 1;
+                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
                    }

                    // extract the logits only for the last token
@ -302,11 +290,12 @@ int main(int argc, char ** argv) {

            llama_batch batch_view = {
                n_tokens,
-                batch.token  + i,
+                batch.token    + i,
                nullptr,
-                batch.pos    + i,
-                batch.seq_id + i,
-                batch.logits + i,
+                batch.pos      + i,
+                batch.n_seq_id + i,
+                batch.seq_id   + i,
+                batch.logits   + i,
                0, 0, 0, // unused
            };

@ -339,7 +328,9 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

-                const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
+                const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
+
+                llama_sampling_accept(client.ctx_sampling, ctx, id, true);

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@ -347,11 +338,8 @@ int main(int argc, char ** argv) {
                    client.t_start_gen = ggml_time_us();
                }

-                // remember which tokens were sampled - used for repetition penalties during sampling
-                client.tokens_prev.erase(client.tokens_prev.begin());
-                client.tokens_prev.push_back(id);
-
                const std::string token_str = llama_token_to_piece(ctx, id);
+
                client.response += token_str;
                client.sampled = id;

@ -359,7 +347,7 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
-                        (id == llama_token_eos(ctx) ||
+                        (id == llama_token_eos(model) ||
                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                         client.response.find("User:") != std::string::npos ||
                         client.response.find('\n') != std::string::npos)) {
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(ctx);
+                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

            const auto batch_logits = llama_get_logits(ctx);
@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(ctx);
+                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -8,9 +8,7 @@

 int main(int argc, char ** argv) {
    gpt_params params;
-    params.seed = 42;
-    params.n_threads = 4;
-    params.repeat_last_n = 64;
+
    params.prompt = "The quick brown fox";

    if (!gpt_params_parse(argc, argv, params)) {
@ -24,56 +22,49 @@ int main(int argc, char ** argv) {
    }

    auto n_past = 0;
-    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
+
+    std::string result0;
+    std::string result1;

    // init
    llama_model * model;
    llama_context * ctx;

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
-    if (model == nullptr) {
-        return 1;
-    }
-    if (ctx == nullptr) {
-        llama_free_model(model);
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }
+
+    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);
-    auto n_prompt_tokens = tokens.size();
-    if (n_prompt_tokens < 1) {
-        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
-        llama_free(ctx);
-        llama_free_model(model);
-        return 1;
-    }

    // evaluate prompt
-    llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));
+    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
+    n_past += tokens.size();

-    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
-    n_past += n_prompt_tokens;
-
-    const size_t state_size = llama_get_state_size(ctx);
-    uint8_t * state_mem = new uint8_t[state_size];
-
-    // Save state (rng, logits, embedding and kv_cache) to file
+    // save state (rng, logits, embedding and kv_cache) to file
    {
-        FILE *fp_write = fopen("dump_state.bin", "wb");
-        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
-        fwrite(state_mem, 1, state_size, fp_write);
-        fclose(fp_write);
+        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
+
+        {
+            FILE *fp_write = fopen("dump_state.bin", "wb");
+            llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
+            fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
+            fclose(fp_write);
+        }
    }

    // save state (last tokens)
-    const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
    const auto n_past_saved = n_past;

    // first run
-    printf("\n%s", params.prompt.c_str());
+    printf("\nfirst run: %s", params.prompt.c_str());

    for (auto i = 0; i < params.n_predict; i++) {
        auto * logits = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(model);
+
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -82,9 +73,10 @@ int main(int argc, char ** argv) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx, next_token);
-        last_n_tokens_data.push_back(next_token);

        printf("%s", next_token_str.c_str());
+        result0 += next_token_str;
+
        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
@ -102,32 +94,28 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    // Load state (rng, logits, embedding and kv_cache) from file
-    {
-        FILE *fp_read = fopen("dump_state.bin", "rb");
-        if (state_size != llama_get_state_size(ctx2)) {
-            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
-            llama_free(ctx2);
-            llama_free_model(model);
-            return 1;
-        }
+    printf("\nsecond run: %s", params.prompt.c_str());

-        const size_t ret = fread(state_mem, 1, state_size, fp_read);
-        if (ret != state_size) {
+    // load state (rng, logits, embedding and kv_cache) from file
+    {
+        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
+
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+
+        const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        if (ret != state_mem.size()) {
            fprintf(stderr, "\n%s : failed to read state\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
            return 1;
        }

-        llama_set_state_data(ctx2, state_mem);  // could also read directly from memory mapped file
+        llama_set_state_data(ctx2, state_mem.data());
+
        fclose(fp_read);
    }

-    delete[] state_mem;
-
    // restore state (last tokens)
-    last_n_tokens_data = last_n_tokens_data_saved;
    n_past = n_past_saved;

    // second run
@ -142,10 +130,11 @@ int main(int argc, char ** argv) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx2, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);
-        last_n_tokens_data.push_back(next_token);

        printf("%s", next_token_str.c_str());
-        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        result1 += next_token_str;
+
+        if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
@ -154,10 +143,17 @@ int main(int argc, char ** argv) {
        n_past += 1;
    }

-    printf("\n\n");
+    printf("\n");

    llama_free(ctx2);
    llama_free_model(model);

+    if (result0 != result1) {
+        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
+        return 1;
+    }
+
+    fprintf(stderr, "\n%s : success\n", __func__);
+
    return 0;
 }
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -24,6 +24,10 @@ Command line options:
 -   `--port`: Set the port to listen. Default: `8080`.
 -   `--path`: path from which to serve static files (default examples/server/public)
 -   `--embedding`: Enable embedding extraction, Default: disabled.
+-   `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1)
+-   `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
+-   `-spf FNAME`, `--system-prompt-file FNAME`: Set a file from which to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+-   `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.

 ## Build

@ -106,25 +110,25 @@ node index.js

 ## API Endpoints

-   **POST** `/completion`: Given a prompt, it returns the predicted completion.
+-   **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

    *Options:*

+    `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
+
    `temperature`: Adjust the randomness of the generated text (default: 0.8).

    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).

    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).

-    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
+    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).

-    `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
-    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.

    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-    `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
-
    `stop`: Specify a JSON array of stopping strings.
    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

@ -158,6 +162,44 @@ node index.js

    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

+    `image_data`: An array of objects holding base64-encoded image `data` and the `id` used to reference each image in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image with id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+
+    *Result JSON:*
+
+    Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+
+    `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+
+    `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
+
+    `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+
+    `model`: The path to the model loaded with `-m`
+
+    `prompt`: The provided `prompt`
+
+    `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
+
+    `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
+
+    `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
+
+    `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
+
+    `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
+
+    `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
+
+    `tokens_evaluated`: Number of tokens evaluated in total from the prompt
+
+    `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
+
+    `slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)
+
+    `cache_prompt`: Save the prompt and generation to avoid reprocessing the entire prompt when only part of it has changed (default: false)
+
+    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+
 -   **POST** `/tokenize`: Tokenize a given text.

    *Options:*
@ -188,8 +230,32 @@ node index.js

    It also accepts all the options of `/completion` except `stream` and `prompt`.

+-   **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+
 ## More examples

+### Change system prompt on runtime
+
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
+
+`prompt`: Specify a context that you want all connecting clients to respect.
+
+`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
+
+`assistant_name`: The bot's name, which each client needs in order to generate the prompt. This must be sent to each client through the `/props` endpoint.
+
+```json
+{
+    "system_prompt": {
+        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
+        "anti_prompt": "User:",
+        "assistant_name": "Assistant:"
+    }
+}
+```
+
+**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
+
 ### Interactive mode

 Check the sample in [chat.mjs](chat.mjs).
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@ -8,6 +8,7 @@ import json


 app = Flask(__name__)
+slot_id = -1

 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
 parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
    if(is_present(body, "stop")): postData["stop"] += body["stop"]
    postData["n_keep"] = -1
    postData["stream"] = stream
-
+    postData["cache_prompt"] = True
+    postData["slot_id"] = slot_id
    return postData

 def make_resData(data, chat=False, promptToken=[]):
@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
            }
        ]
    }
+    slot_id = data["slot_id"]
    if (chat):
        if (start):
            resData["choices"][0]["delta"] =  {
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@ -7,6 +7,11 @@ const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
    (_, index) => args[index - 1] === "--grammar-json-schema"
 );
+
+const no_cached_prompt = args.find(
+    (_, index) => args[index - 1] === "--no-cache-prompt"
+) ?? "false";
+
 const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");

 // Example usage: function,arguments
@ -30,6 +35,9 @@ if (grammarFile) {
    grammar = readFileSync(grammarFile, 'utf-8')
 }

+// for cached prompt
+let slot_id = -1;
+
 const API_URL = 'http://127.0.0.1:8080'

 const chat = [
@ -76,6 +84,8 @@ async function chat_completion(question) {
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
+            cache_prompt: no_cached_prompt === "false",
+            slot_id: slot_id,
            stop: ["\n### Human:"], // stop completion after generating this
            grammar,
            stream: true,
@ -92,6 +102,7 @@ async function chat_completion(question) {
        const t = Buffer.from(chunk).toString('utf8')
        if (t.startsWith('data: ')) {
            const message = JSON.parse(t.substring(6))
+            slot_id = message.slot_id
            answer += message.content
            process.stdout.write(message.content)
            if (message.stop) {
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -125,6 +125,7 @@
      background-color: #222;
      color: #ddd;
    }
+
    code {
      font-family: monospace;
      padding: 0.1em 0.3em;
@ -136,7 +137,13 @@
      display: block;
    }

-    header, footer {
+    fieldset label.slim {
+      margin: 0 0.5em;
+      display: inline;
+    }
+
+    header,
+    footer {
      text-align: center;
    }

@ -145,11 +152,20 @@
      color: #888;
    }

+    .mode-chat textarea[name=prompt] {
+      height: 4.5em;
+    }
+
+    .mode-completion textarea[name=prompt] {
+      height: 10em;
+    }
+

    @keyframes loading-bg-wipe {
      0% {
        background-position: 0%;
      }
+
      100% {
        background-position: 100%;
      }
@ -168,6 +184,7 @@
        --loading-color-1: #22222200;
        --loading-color-2: #222222ff;
      }
+
      .popover-content {
        background-color: black;
      }
@ -181,15 +198,18 @@

    import { llama } from '/completion.js';
    import { SchemaConverter } from '/json-schema-to-grammar.mjs';
+    let selected_image = false;
+    var slot_id = -1;

    const session = signal({
      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
      historyTemplate: "{{name}}: {{message}}",
      transcript: [],
-      type: "chat",
+      type: "chat",  // "chat" | "completion"
      char: "Llama",
      user: "User",
+      image_selected: ''
    })

    const params = signal({
@ -207,7 +227,9 @@
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
      grammar: '',
-      n_probs: 0, // no completion_probabilities
+      n_probs: 0, // no completion_probabilities,
+      image_data: [],
+      cache_prompt: true
    })

    /* START: Support for storing prompt templates and parameters in browser LocalStorage */
@ -257,6 +279,7 @@
      // saved templates were successfully imported.

      console.log('Processing saved templates and updating default template')
+      params.value = { ...params.value, image_data: [] };

      //console.log(importedTemplates);
      savedUserTemplates.value = importedTemplates;
@ -281,7 +304,9 @@

    function userTemplateApply(t) {
      session.value = t.data.session;
+      session.value = { ...session.value, image_selected: '' };
      params.value = t.data.params;
+      params.value = { ...params.value, image_data: [] };
    }

    function userTemplateResetToDefaultAndApply() {
@ -365,17 +390,53 @@
      return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
    }

+    async function runLlama(prompt, llamaParams, char) {
+      const currentMessages = [];
+      const history = session.value.transcript;
+      if (controller.value) {
+        throw new Error("already running");
+      }
+      controller.value = new AbortController();
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
+        const data = chunk.data;
+
+        if (data.stop) {
+          while (
+            currentMessages.length > 0 &&
+            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
+          ) {
+            currentMessages.pop();
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
+        } else {
+          currentMessages.push(data);
+          slot_id = data.slot_id;
+          if (selected_image && !data.multimodal) {
+            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
+            return;
+          }
+          transcriptUpdate([...history, [char, currentMessages]])
+        }
+
+        if (data.timings) {
+          llamaStats.value = data.timings;
+        }
+      }
+
+      controller.value = null;
+    }
+
    // send message to server
    const chat = async (msg) => {
      if (controller.value) {
        console.log('already running...');
        return;
      }
-      controller.value = new AbortController();

      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])

-      const prompt = template(session.value.template, {
+      let prompt = template(session.value.template, {
        message: msg,
        history: session.value.transcript.flatMap(
          ([name, data]) =>
@ -390,56 +451,67 @@
            )
        ).join("\n"),
      });
-
-      const currentMessages = [];
-      const history = session.value.transcript
-
-      const llamaParams = {
+      if (selected_image) {
+        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
+      }
+      await runLlama(prompt, {
        ...params.value,
+        slot_id: slot_id,
        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
+      }, "{{char}}");
+    }
+
+    // Runs a plain (non-chat) completion using session.value.prompt as the prompt.
+    // Does nothing except log when a run is already in progress.
+    const runCompletion = async () => {
+      if (controller.value) {
+        console.log('already running...');
+        return;
      }
+      const { prompt } = session.value;
+      // record the prompt in the transcript with an empty speaker label
+      transcriptUpdate([...session.value.transcript, ["", prompt]]);
+      await runLlama(prompt, {
+        ...params.value,
+        slot_id: slot_id,
+        stop: [],
+      }, "");
+    }

-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
-        const data = chunk.data;
-
-        if (data.stop) {
-          while (
-            currentMessages.length > 0 &&
-            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
-          ) {
-            currentMessages.pop();
-          }
-          transcriptUpdate([...history, ["{{char}}", currentMessages]])
-          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
-        } else {
-          currentMessages.push(data);
-          transcriptUpdate([...history, ["{{char}}", currentMessages]])
-        }
-
-        if (data.timings) {
-          llamaStats.value = data.timings;
-        }
+    // Event handler: suppresses the event's default action, then aborts the
+    // current request (if controller.value is set) and clears the controller.
+    const stop = (e) => {
+      e.preventDefault();
+      if (controller.value) {
+        controller.value.abort();
+        controller.value = null;
      }
+    }

-      controller.value = null;
+    // Event handler: stops any current run via stop(e), then replaces the
+    // transcript with an empty one.
+    const reset = (e) => {
+      stop(e);
+      transcriptUpdate([]);
+    }
+
+    const uploadImage = (e) => {
+      e.preventDefault();
+      document.getElementById("fileInput").click();
+      document.getElementById("fileInput").addEventListener("change", function (event) {
+        const selectedFile = event.target.files[0];
+        if (selectedFile) {
+          const reader = new FileReader();
+          reader.onload = function () {
+            const image_data = reader.result;
+            session.value = { ...session.value, image_selected: image_data };
+            params.value = {
+              ...params.value, image_data: [
+                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
+            }
+          };
+          selected_image = true;
+          reader.readAsDataURL(selectedFile);
+        }
+      });
    }

    function MessageInput() {
      const message = useSignal("")

-      const stop = (e) => {
-        e.preventDefault();
-        if (controller.value) {
-          controller.value.abort();
-          controller.value = null;
-        }
-      }
-
-      const reset = (e) => {
-        stop(e);
-        transcriptUpdate([]);
-      }
-
      const submit = (e) => {
        stop(e);
        chat(message.value);
@ -467,6 +539,7 @@
          </div>
          <div class="right">
            <button type="submit" disabled=${generating.value}>Send</button>
+            <button onclick=${uploadImage}>Upload Image</button>
            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
            <button onclick=${reset}>Reset</button>
          </div>
@ -474,6 +547,19 @@
      `
    }

+    // Renders the Start / Stop / Reset buttons used in completion mode.
+    // Start and Stop are enabled/disabled from generating.value.
+    function CompletionControls() {
+      // stop whatever is running first, then start a fresh completion
+      const submit = (e) => {
+        stop(e);
+        runCompletion();
+      }
+      return html`
+        <div>
+          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
+          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+          <button onclick=${reset}>Reset</button>
+        </div>`;
+    }
+
    const ChatLog = (props) => {
      const messages = session.value.transcript;
      const container = useRef(null)
@ -497,11 +583,16 @@
            data;
          message = html`<${Markdownish} text=${template(text)} />`
        }
-        return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+        if (user) {
+          return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+        } else {
+          return html`<p key=${index}>${message}</p>`
+        }
      };

      return html`
        <section id="chat" ref=${container}>
+          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
          ${messages.flatMap(chatLine)}
        </section>`;
    };
@ -520,7 +611,7 @@
          const converter = new SchemaConverter(
            grammarJsonSchemaPropOrder.value
              .split(',')
-              .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
          )
          converter.visit(schema, '')
          params.value = {
@ -532,7 +623,7 @@
        }
      }

-      const FloatField = ({label, max, min, name, step, value}) => {
+      const FloatField = ({ label, max, min, name, step, value }) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
@ -542,7 +633,7 @@
        `
      };

-      const IntField = ({label, max, min, name, value}) => {
+      const IntField = ({ label, max, min, name, value }) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
@ -574,18 +665,31 @@
        userTemplateAutosave()
      }, [session.value, params.value])

-      return html`
-        <form>
-          <fieldset>
-            <${UserTemplateResetButton}/>
-          </fieldset>
+      const GrammarControl = () => (
+        html`
+          <div>
+            <label for="template">Grammar</label>
+            <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+            <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+            <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+          </div>
+          `
+      );

-          <fieldset>
-            <div>
-              <label for="prompt">Prompt</label>
-              <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
-            </div>
-          </fieldset>
+      // Renders the fieldset containing the prompt textarea, bound to
+      // session.value.prompt and updated through updateSession.
+      const PromptControlFieldSet = () => (
+        html`
+        <fieldset>
+          <div>
+            <label htmlFor="prompt">Prompt</label>
+            <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
+          </div>
+        </fieldset>
+        `
+      );
+
+      const ChatConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}

          <fieldset class="two">
            <div>
@ -609,30 +713,45 @@
              <label for="template">Chat history template</label>
              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
            </div>
+            ${GrammarControl()}
+          </fieldset>
+      `
+      );

+      // Settings shown in completion mode: the prompt fieldset followed by the
+      // grammar controls wrapped in their own fieldset.
+      const CompletionConfigForm = () => (
+        html`
+          ${PromptControlFieldSet()}
+          <fieldset>${GrammarControl()}</fieldset>
+        `
+      );
+
+      return html`
+        <form>
+          <fieldset class="two">
+            <${UserTemplateResetButton}/>
            <div>
-              <label for="template">Grammar</label>
-              <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
-              <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-              <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+              <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
+              <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
            </div>
          </fieldset>

+          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
+
          <fieldset class="two">
-            ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
-            ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
-            ${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
-            ${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
-            ${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
-            ${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
+            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
+            ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
+            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
+            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
          </fieldset>
          <details>
            <summary>More options</summary>
            <fieldset class="two">
-              ${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
-              ${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
-              ${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
-              ${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
+              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
+              ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+              ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
+              ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
            </fieldset>
            <hr />
            <fieldset class="three">
@ -641,11 +760,11 @@
                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
              </div>
-              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
-              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
+              ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
+              ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
            </fieldset>
            <fieldset>
-              ${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
+              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
            </fieldset>
          </details>
        </form>
@ -684,20 +803,20 @@
        const popoverChildren = html`
          <div class="prob-set">
            ${probs.map((p, index) => {
-              return html`
+          return html`
                <div
                  key=${index}
                  title=${`prob: ${p.prob}`}
                  style=${{
-                    padding: '0.3em',
-                    backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
-                  }}
+              padding: '0.3em',
+              backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+            }}
                >
                  <span>${p.tok_str}: </span>
                  <span>${Math.floor(p.prob * 100)}%</span>
                </div>
              `
-            })}
+        })}
          </div>
        `

@ -776,9 +895,9 @@
              ref=${popoverRef}
              class="popover-content"
              style=${{
-                top: position.value.top,
-                left: position.value.left,
-              }}
+            top: position.value.top,
+            left: position.value.left,
+          }}
            >
              ${props.popoverChildren}
            </div>
@ -851,7 +970,7 @@
    function App(props) {

      return html`
-        <div>
+        <div class="mode-${session.value.type}">
          <header>
            <h1>llama.cpp</h1>
          </header>
@ -861,7 +980,7 @@
          </main>

          <section id="write">
-            <${MessageInput} />
+            <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
          </section>

          <footer>
@ -877,8 +996,11 @@
 </head>

 <body>
-  <div id="container"></div>
+  <div id="container">
+    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+  </div>
  <div id="portal"></div>
 </body>

 </html>
+
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
    // create a llama_batch with size 512
    // we use this object to submit token data for decoding

-    llama_batch batch = llama_batch_init(512, 0);
+    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    batch.n_tokens = tokens_list.size();
@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream?
-            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                LOG_TEE("\n");

                break;
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -2,13 +2,25 @@

 #include "common.h"
 #include "llama.h"
-#include "grammar-parser.h"

 #include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

+struct seq_draft { // state of one draft sequence (tree branch) used by the speculation loop in main
+    bool active   = false; // branch is still a candidate; cleared when its token at the current position differs from the target's sample
+    bool drafting = false; // branch is still being extended by the draft model
+    bool skip     = false; // set on a branch just created by a split so it is not sampled again in the same drafting iteration
+
+    int i_batch_dft = 0; // index of this branch's latest token in batch_dft, passed to llama_sampling_sample on the draft context
+    std::vector<int> i_batch_tgt; // index in batch_tgt of each token of this branch, used when sampling from the target context
+
+    std::vector<llama_token> tokens; // tokens of this branch; the leading token comes from the target and is erased after the target decode
+
+    struct llama_sampling_context * ctx_sampling; // not freed by this struct: main creates it with llama_sampling_init and releases it with llama_sampling_free
+};
+
 int main(int argc, char ** argv) {
    gpt_params params;

@ -21,6 +33,13 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    // max number of parallel drafting sequences (i.e. tree branches)
+    const int n_seq_dft = params.n_parallel;
+
+    // TODO: make this configurable
+    const float p_accept = 0.80f;
+    const float p_split  = 0.10f;
+
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("speculative", "log"));
    LOG_TEE("Log start\n");
@ -77,8 +96,6 @@ int main(int argc, char ** argv) {
    const auto t_enc_end = ggml_time_us();

    // the 2 models should have the same vocab
-    const int n_ctx   = llama_n_ctx(ctx_tgt);
-    const int n_vocab = llama_n_vocab(model_tgt);
    //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));

    // how many tokens to draft each time
@ -91,114 +108,128 @@ int main(int argc, char ** argv) {
    int n_past_tgt = inp.size();
    int n_past_dft = inp.size();

-    std::vector<llama_token> drafted;
-
-    std::vector<llama_token> last_tokens(n_ctx);
-    std::fill(last_tokens.begin(), last_tokens.end(), 0);
-
-    for (auto & id : inp) {
-        last_tokens.erase(last_tokens.begin());
-        last_tokens.push_back(id);
-    }
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
    // used to determine end of generation
    bool has_eos = false;

-    // grammar stuff
-    struct llama_grammar * grammar_dft = NULL;
-    struct llama_grammar * grammar_tgt = NULL;
+    // target model sampling context
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

-    grammar_parser::parse_state parsed_grammar;
+    // draft sequence data
+    std::vector<seq_draft> drafts(n_seq_dft);

-    // if requested - load the grammar, error checking is omitted for brevity
-    if (!params.grammar.empty()) {
-        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-        // will be empty (default) if there are parse errors
-        if (parsed_grammar.rules.empty()) {
-            return 1;
-        }
+    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
+    params.sparams.temp = std::max(0.01f, params.sparams.temp);

-        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-        grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    for (int s = 0; s < n_seq_dft; ++s) {
+        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
    }

+    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
+    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
+
    const auto t_dec_start = ggml_time_us();

-    while (true) {
-        LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
+    // sample from the last token of the prompt
+    drafts[0].i_batch_tgt.resize(1);
+    drafts[0].i_batch_tgt[0] = 0;

-        int i_dft = 0;
+    while (true) {
+        // print current draft sequences
+        for (int s = 0; s < n_seq_dft; ++s) {
+            if (!drafts[s].active) {
+                continue;
+            }
+
+            const auto & tokens = drafts[s].tokens;
+
+            LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
+        }
+
+        int i_dft  = 0;
+        int s_keep = 0;

        while (true) {
+            LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+
            // sample from the target model
-            llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+            llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);

-            // remember which tokens were sampled - used for repetition penalties during sampling
-            last_tokens.erase(last_tokens.begin());
-            last_tokens.push_back(id);
+            llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);

-            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
+            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());

            const std::string token_str = llama_token_to_piece(ctx_tgt, id);
+
            printf("%s", token_str.c_str());
            fflush(stdout);

-            if (id == llama_token_eos(ctx_tgt)) {
+            if (id == llama_token_eos(model_tgt)) {
                has_eos = true;
            }

            ++n_predict;

-            // check if the draft matches the target
-            if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
-                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
-                ++n_accept;
-                ++n_past_tgt;
-                ++n_past_dft;
-                ++i_dft;
-
-                continue;
-            }
-
-            // the drafted token was rejected or we are out of drafted tokens
-
-            if (i_dft < (int) drafted.size()) {
-                LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
-                        i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
-            } else {
-                LOG("out of drafted tokens\n");
-            }
-
-            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-            llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
-            ++n_past_dft;
-
-            // heuristic for n_draft
+            // check if the target token matches any of the drafts
            {
-                const int  n_draft_cur  = (int) drafted.size();
-                const bool all_accepted = i_dft == n_draft_cur;
+                bool matches = false;

-                LOG("n_draft      = %d\n", n_draft);
-                LOG("n_draft_cur  = %d\n", n_draft_cur);
-                LOG("i_dft        = %d\n", i_dft);
-                LOG("all_accepted = %d\n", all_accepted);
+                for (int s = 0; s < n_seq_dft; ++s) {
+                    if (!drafts[s].active) {
+                        continue;
+                    }

-                if (all_accepted && n_draft == n_draft_cur) {
-                    LOG(" - max drafted tokens accepted - n_draft += 8\n");
-                    n_draft = std::min(30, n_draft + 8);
-                } else if (all_accepted) {
-                    LOG(" - partially drafted tokens accepted - no change\n");
-                } else {
-                    LOG(" - drafted token rejected - n_draft -= 1\n");
-                    n_draft = std::max(2, n_draft - 1);
+                    if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
+                        LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
+
+                        s_keep = s;
+                        matches = true;
+                    } else {
+                        drafts[s].active = false;
+                    }
+                }
+
+                if (matches) {
+                    ++n_accept;
+                    ++n_past_tgt;
+                    ++n_past_dft;
+                    ++i_dft;
+
+                    continue;
                }
            }

-            drafted.clear();
-            drafted.push_back(id);
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+
+            // TODO: simplify
+            {
+                LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+
+                llama_kv_cache_seq_keep(ctx_dft, s_keep);
+                llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
+                llama_kv_cache_seq_keep(ctx_dft, 0);
+
+                llama_kv_cache_seq_rm  (ctx_tgt, s_keep, n_past_tgt, -1);
+                llama_kv_cache_seq_keep(ctx_tgt, s_keep);
+                llama_kv_cache_seq_cp  (ctx_tgt, s_keep, 0, -1, -1);
+                llama_kv_cache_seq_keep(ctx_tgt, 0);
+            }
+
+            for (int s = 0; s < n_seq_dft; ++s) {
+                drafts[s].active = false;
+                drafts[s].tokens.clear();
+                drafts[s].i_batch_tgt.clear();
+            }
+            // note: will be erased after the speculation phase
+            drafts[0].tokens.push_back(id);
+            drafts[0].i_batch_tgt.push_back(0);
+
+            llama_batch_clear(batch_dft);
+            llama_batch_add  (batch_dft, id, n_past_dft, { 0 }, true);
+
+            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
+            llama_decode         (ctx_dft, batch_dft);
+
+            ++n_past_dft;

            break;
        }
@ -207,72 +238,151 @@ int main(int argc, char ** argv) {
            break;
        }

-        if (grammar_tgt) {
-            if (grammar_dft) {
-                llama_grammar_free(grammar_dft);
-            }
-            grammar_dft = llama_grammar_copy(grammar_tgt);
+        llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);

-            LOG("copied target grammar to draft grammar\n");
-        }
-
-        // sample n_draft tokens from the draft model using greedy decoding
+        int n_seq_cur  = 1;
        int n_past_cur = n_past_dft;
+
+        for (int s = 0; s < n_seq_dft; ++s) {
+            drafts[s].active   = false;
+            drafts[s].drafting = false;
+        }
+        drafts[0].active      = true;
+        drafts[0].drafting    = true;
+        drafts[0].i_batch_dft = 0;
+
+        llama_batch_clear(batch_tgt);
+        llama_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
+
+        // sample n_draft tokens from the draft model using tree-based sampling
        for (int i = 0; i < n_draft; ++i) {
-            float * logits = llama_get_logits(ctx_dft);
+            batch_dft.n_tokens = 0;

-            candidates.clear();
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+            for (int s = 0; s < n_seq_dft; ++s) {
+                drafts[s].skip = false;
            }

-            llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+            for (int s = 0; s < n_seq_dft; ++s) {
+                if (!drafts[s].drafting || drafts[s].skip) {
+                    continue;
+                }

-            if (grammar_dft != NULL) {
-                llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
+                llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
+
+                const auto & cur_p = drafts[s].ctx_sampling->cur;
+
+                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
+                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                            k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
+                }
+
+                if (cur_p[0].p < p_accept) {
+                    LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
+                    drafts[s].drafting = false;
+                    continue;
+                }
+
+                std::vector<int> sa(1, s);
+
+                // attempt to split the branch if the probability is high enough
+                for (int f = 1; f < 8; ++f) {
+                    if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
+                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
+
+                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
+                        llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
+
+                        // all previous tokens from this branch are now also part of the new branch
+                        for (int t = 0; t < batch_tgt.n_tokens; ++t) {
+                            for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) {
+                                if (batch_tgt.seq_id[t][p] == s) {
+                                    batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur;
+                                    batch_tgt.n_seq_id[t]++;
+                                    break;
+                                }
+                            }
+                        }
+
+                        // copy the draft state
+                        drafts[n_seq_cur].active   = true;
+                        drafts[n_seq_cur].drafting = true;
+                        drafts[n_seq_cur].skip     = true;
+
+                        drafts[n_seq_cur].tokens      = drafts[s].tokens;
+                        drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
+                        drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
+
+                        llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
+
+                        sa.push_back(n_seq_cur);
+
+                        n_seq_cur++;
+                    } else {
+                        break;
+                    }
+                }
+
+                // add drafted token for each sequence
+                for (int is = 0; is < (int) sa.size(); ++is) {
+                    const llama_token id = cur_p[is].id;
+
+                    const int s = sa[is];
+
+                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
+
+                    drafts[s].tokens.push_back(id);
+
+                    // add unique drafted tokens to the target batch
+                    drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
+
+                    llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
+
+                    // add the token to the batch for batched decoding with the draft model
+                    drafts[s].i_batch_dft = batch_dft.n_tokens;
+
+                    llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
+
+                    if (batch_tgt.n_tokens > n_draft) {
+                        drafts[s].drafting = false;
+                    }
+                }
            }

-            // computes softmax and sorts the candidates
-            llama_sample_softmax(ctx_dft, &cur_p);
-
-            for (int i = 0; i < 3; ++i) {
-                LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
-            }
-
-            // TODO: better logic?
-            if (cur_p.data[0].p < 2*cur_p.data[1].p) {
-                LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
+            // no sequence is drafting anymore
+            if (batch_dft.n_tokens == 0) {
                break;
            }

-            // drafted token
-            const llama_token id = cur_p.data[0].id;
-
-            drafted.push_back(id);
+            // evaluate the drafted tokens on the draft model
+            llama_decode(ctx_dft, batch_dft);
+            ++n_past_cur;
            ++n_drafted;

-            // no need to evaluate the last drafted token, since we won't use the result
-            if (i == n_draft - 1) {
+            if (batch_tgt.n_tokens > n_draft) {
                break;
            }
-
-            // evaluate the drafted token on the draft model
-            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
-            llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
-            ++n_past_cur;
-
-            if (grammar_dft != NULL) {
-                llama_grammar_accept_token(ctx_dft, grammar_dft, id);
-            }
        }

        // evaluate the target model on the drafted tokens
-        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
-        llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
-        ++n_past_tgt;
+        {
+            llama_kv_cache_seq_keep(ctx_tgt, 0);
+            for (int s = 1; s < n_seq_dft; ++s) {
+                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
+            }

-        // the first token is always proposed by the traget model before the speculation loop
-        drafted.erase(drafted.begin());
+            //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
+            llama_decode(ctx_tgt, batch_tgt);
+            ++n_past_tgt;
+        }
+
+        // the first token is always proposed by the target model before the speculation loop so we erase it here
+        for (int s = 0; s < n_seq_dft; ++s) {
+            if (!drafts[s].active) {
+                continue;
+            }
+
+            drafts[s].tokens.erase(drafts[s].tokens.begin());
+        }
    }

    auto t_dec_end = ggml_time_us();
@ -280,9 +390,8 @@ int main(int argc, char ** argv) {
    LOG_TEE("\n\n");

    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

-    // TODO: make sure these numbers are computed correctly
    LOG_TEE("\n");
    LOG_TEE("n_draft   = %d\n", n_draft);
    LOG_TEE("n_predict = %d\n", n_predict);
@ -296,16 +405,19 @@ int main(int argc, char ** argv) {
    LOG_TEE("\ntarget:\n");
    llama_print_timings(ctx_tgt);

+    llama_sampling_free(ctx_sampling);
+    for (int s = 0; s < n_seq_dft; ++s) {
+        llama_sampling_free(drafts[s].ctx_sampling);
+    }
+
+    llama_batch_free(batch_dft);
+
    llama_free(ctx_tgt);
    llama_free_model(model_tgt);

    llama_free(ctx_dft);
    llama_free_model(model_dft);

-    if (grammar_dft != NULL) {
-        llama_grammar_free(grammar_dft);
-        llama_grammar_free(grammar_tgt);
-    }
    llama_backend_free();

    fprintf(stderr, "\n\n");
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -253,13 +253,14 @@ static void init_model(struct my_llama_model * model) {
    set_param_model(model);

    // measure data size
-    struct ggml_allocr * alloc = NULL;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    alloc_model(alloc, model);
+    size_t size = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
+    }

    // allocate data
-    model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
-    ggml_allocr_free(alloc);
+    struct ggml_allocr * alloc = NULL;
+    model->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
    alloc_model(alloc, model);
    ggml_allocr_free(alloc);
@ -1094,11 +1095,9 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);

    // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
+                            tensor_alignment;
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

    // allocate input tensors