From 11dc1091f64b24ca6d643acc6d0051117ba60161 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Sat, 14 Oct 2023 13:52:44 +0300 Subject: [PATCH 01/28] Honor -ngl option for Cuda offloading in llava (#3621) --- examples/llava/llava.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 14dacc780..8384d9d78 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -79,7 +79,13 @@ int main(int argc, char ** argv) { llama_backend_init(params.numa); - llama_model_params model_params = llama_model_default_params(); + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = params.n_gpu_layers; + model_params.main_gpu = params.main_gpu; + model_params.tensor_split = params.tensor_split; + model_params.use_mmap = params.use_mmap; + model_params.use_mlock = params.use_mlock; + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); From 11bff290458f12f020b588792707f76ec658a27a Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Sun, 15 Oct 2023 02:32:06 -0400 Subject: [PATCH 02/28] MPT : support GQA for replit-code-v1.5 (#3627) --- convert-mpt-hf-to-gguf.py | 2 ++ llama.cpp | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py index 73a4932f7..19a66820d 100755 --- a/convert-mpt-hf-to-gguf.py +++ b/convert-mpt-hf-to-gguf.py @@ -98,6 +98,8 @@ gguf_writer.add_embedding_length(hparams["d_model"]) gguf_writer.add_block_count(block_count) gguf_writer.add_feed_forward_length(4 * hparams["d_model"]) gguf_writer.add_head_count(hparams["n_heads"]) +if kv_n_heads := hparams["attn_config"].get("kv_n_heads"): + gguf_writer.add_head_count_kv(kv_n_heads) gguf_writer.add_layer_norm_eps(1e-05) if hparams["attn_config"]["clip_qkv"] is not None: gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"]) diff --git a/llama.cpp b/llama.cpp index 2cd2dad7f..5329bd828 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2839,8 +2839,8 @@ static void llm_load_tensors( auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); @@ -5368,7 +5368,7 @@ static struct ggml_cgraph * llm_build_mpt( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA + const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); From 940efa95fec0b8a98c226a889d2ad839dfeeae0d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Oct 2023 23:58:00 +0300 Subject: [PATCH 03/28] llava : fix tokenization to not add bos between image embeddings and user prompt (#3645) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llava : fix tokenization to not add bos after system prompt * set seed --------- Co-authored-by: M. Yusuf Sarıgöz --- examples/llava/llava-utils.h | 4 ++-- examples/llava/llava.cpp | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h index 79e237c86..4e71351dd 100644 --- a/examples/llava/llava-utils.h +++ b/examples/llava/llava-utils.h @@ -49,9 +49,9 @@ inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { return eval_tokens(ctx_llama, tokens, 1, n_past); } -inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){ +inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, true); + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos); eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return true; } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 8384d9d78..b24cb2e6f 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -97,6 +97,7 @@ int main(int argc, char ** argv) { ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.seed = params.seed; llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); @@ -106,7 +107,8 @@ int main(int argc, char ** argv) { } // make sure that the correct mmproj was used, i.e., compare apples to apples - int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); + const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); + if (n_img_embd != n_llama_embd) { printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd); @@ -125,14 +127,14 @@ int main(int argc, char ** argv) { const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - // GG: are we sure that the should be a trailing whitespace at the end of this string? - eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past); + eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true); eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past); - eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past); - eval_string(ctx_llama, "\nASSISTANT:", params.n_batch, &n_past); + eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false); // generate the response + printf("\n"); + printf("prompt: '%s'\n", params.prompt.c_str()); printf("\n"); for (int i = 0; i < max_tgt_len; i++) { From 281ef73c258cc1eebec8a64264240432d5878c4b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Oct 2023 09:19:28 +0300 Subject: [PATCH 04/28] k-quants : fix quantization ranges (#3646) --- k_quants.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/k_quants.c b/k_quants.c index 558f5fda8..e168a87bb 100644 --- a/k_quants.c +++ b/k_quants.c @@ -462,12 +462,9 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { } size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - const int nb = k / QK_K; + (void)hist; // TODO: collect histograms - // TODO - collect histograms - although, at a second thought, I don't really care about them - (void)hist; - - for (int j = 0; j < nb; j += k) { + for (int j = 0; j < n; j += k) { block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } @@ -678,12 +675,9 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { } size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - const int nb = k / QK_K; + (void)hist; // TODO: collect histograms - // TODO - collect histograms - although, at a second thought, I don't really care about them - (void)hist; - - for (int j = 0; j < nb; j += k) { + for (int j = 0; j < n; j += k) { block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } @@ -846,9 +840,9 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); - const int nb = k / QK_K; (void)hist; // TODO: collect histograms - for (int j = 0; j < nb; j += k) { + + for (int j = 0; j < n; j += k) { block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } @@ -1052,9 +1046,9 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); - const int nb = k / QK_K; - (void)hist; - for (int j = 0; j < nb; j += k) { + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } @@ -1200,11 +1194,9 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK_K == 0); - const int nb = k / QK_K; + (void)hist; // TODO: collect histograms - (void)hist; // TODO - - for (int j = 0; j < nb; j += k) { + for (int j = 0; j < n; j += k) { block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; 
quantize_row_q6_K_reference(src + j, y, k); } From 1a159553f921a9209fed8c714494e57b3649f232 Mon Sep 17 00:00:00 2001 From: staviq Date: Tue, 17 Oct 2023 17:11:01 +0200 Subject: [PATCH 05/28] tokenizer : special token handling (#3538) * Rewrite special token handling from #1931 * shorten param name, add st verification by type * use offsets instead of copy by substr * formatting, remove copying iterator on delete * llama : normalize code-style * swift fix * print pfx/sfx if verb, main: split pfx input sfx * dont add space when using special tokens * minor : comment + spacing --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 12 +- common/common.h | 6 +- common/train.cpp | 8 +- examples/batched.swift/Sources/main.swift | 2 +- examples/main/main.cpp | 40 ++- llama.cpp | 290 +++++++++++++++++++++- llama.h | 13 +- 7 files changed, 332 insertions(+), 39 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9c4f7df20..3e4b8a8cb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -879,21 +879,23 @@ std::tuple llama_init_from_gpt_par std::vector llama_tokenize( const struct llama_context * ctx, const std::string & text, - bool add_bos) { - return llama_tokenize(llama_get_model(ctx), text, add_bos); + bool add_bos, + bool special) { + return llama_tokenize(llama_get_model(ctx), text, add_bos, special); } std::vector llama_tokenize( const struct llama_model * model, const std::string & text, - bool add_bos) { + bool add_bos, + bool special) { // upper limit for the number of tokens int n_tokens = text.length() + add_bos; std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); + n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos); + int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); diff --git a/common/common.h b/common/common.h index 36fd44166..08c603231 100644 --- a/common/common.h +++ b/common/common.h @@ -137,12 +137,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param std::vector llama_tokenize( const struct llama_context * ctx, const std::string & text, - bool add_bos); + bool add_bos, + bool special = false); std::vector llama_tokenize( const struct llama_model * model, const std::string & text, - bool add_bos); + bool add_bos, + bool special = false); // tokenizes a token into a piece // should work similar to Python's `tokenizer.id_to_piece` diff --git a/common/train.cpp b/common/train.cpp index 35a4cf9e6..972eaefe0 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -863,7 +863,7 @@ size_t tokenize_file( (int) buf.size(), out_tokens.data(), (int) out_tokens.size(), - false); + false, false); if (n_tokens < 0) { out_tokens.resize(-n_tokens); n_tokens = llama_tokenize( @@ -872,7 +872,7 @@ size_t tokenize_file( (int) buf.size(), out_tokens.data(), (int) out_tokens.size(), - false); + false, false); } if (n_tokens >= 0) { out_tokens.resize(n_tokens); @@ -966,7 +966,7 @@ size_t tokenize_file( (int) buf_sample.size(), tok_sample.data(), (int) tok_sample.size(), - false); + false, false); if (n_tokens < 0) { tok_sample.resize(-n_tokens); n_tokens = llama_tokenize(llama_get_model(lctx), @@ -974,7 +974,7 @@ size_t 
tokenize_file( (int) buf_sample.size(), tok_sample.data(), (int) tok_sample.size(), - false); + false, false); GGML_ASSERT(n_tokens >= 0); } GGML_ASSERT(n_tokens <= (int) tok_sample.size()); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 938f30512..05d1bb9d0 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -209,7 +209,7 @@ llama_print_timings(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let n_tokens = text.count + (add_bos ? 1 : 0) let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos) + let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) var swiftTokens: [llama_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 55f73356f..a5fb65548 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -238,7 +238,7 @@ int main(int argc, char ** argv) { if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); } else { LOG("use session tokens\n"); embd_inp = session_tokens; @@ -260,10 +260,10 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); original_prompt_len = original_inp.size(); @@ -320,8 +320,8 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); @@ -383,6 +383,12 @@ int main(int argc, char ** argv) { if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } } @@ -392,10 +398,22 @@ int main(int argc, char ** argv) { if (!params.input_prefix.empty()) { LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, 
tmp[i]).c_str()); + } + } } if (!params.input_suffix.empty()) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } } LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", @@ -717,7 +735,7 @@ int main(int argc, char ** argv) { if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } @@ -744,8 +762,7 @@ int main(int argc, char ** argv) { std::string buffer; if (!params.input_prefix.empty()) { LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - buffer += params.input_prefix; - printf("%s", buffer.c_str()); + printf("%s", params.input_prefix.c_str()); } // color user input only @@ -767,7 +784,6 @@ int main(int argc, char ** argv) { // append input suffix if any if (!params.input_suffix.empty()) { LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - buffer += params.input_suffix; printf("%s", params.input_suffix.c_str()); } @@ -782,10 +798,14 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } - const auto line_inp = ::llama_tokenize(ctx, buffer, false); + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); // instruct mode: insert response suffix if (params.instruct) { diff --git a/llama.cpp b/llama.cpp index 5329bd828..82b7638ae 100644 --- a/llama.cpp +++ b/llama.cpp @@ -75,6 +75,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -1183,6 +1184,8 @@ struct llama_vocab { std::unordered_map token_to_id; std::vector id_to_token; + std::unordered_map special_tokens_cache; + std::map, int> bpe_ranks; // default LLaMA special tokens @@ -2125,7 +2128,7 @@ static void llm_load_hparams( } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false); static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( @@ -2241,6 +2244,101 @@ static void llm_load_vocab( GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); GGUF_GET_KEY(ctx, 
vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); + + // build special tokens cache + { + // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type, + // and will always be correctly labeled in 'added_tokens.json' etc. + // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed + // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer + // are special tokens. + // From testing, this appears to corelate 1:1 with special tokens. + // + + // Counting special tokens and verifying in only one direction + // is sufficient to detect difference in those two sets. + // + uint32_t special_tokens_count_by_type = 0; + uint32_t special_tokens_count_from_verification = 0; + + bool special_tokens_definition_mismatch = false; + + for (const auto & t : vocab.token_to_id) { + const auto & token = t.first; + const auto & id = t.second; + + // Count all non-normal tokens in the vocab while iterating + if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_count_by_type++; + } + + // Skip single character tokens + if (token.length() > 1) { + bool is_tokenizable = false; + + // Split token string representation in two, in all possible ways + // and check if both halves can be matched to a valid token + for (unsigned i = 1; i < token.length();) { + const auto left = token.substr(0, i); + const auto right = token.substr(i); + + // check if we didnt partition in the middle of a utf sequence + auto utf = utf8_len(left.at(left.length() - 1)); + + if (utf == 1) { + if (vocab.token_to_id.find(left) != vocab.token_to_id.end() && + vocab.token_to_id.find(right) != vocab.token_to_id.end() ) { + is_tokenizable = true; + break; + } + i++; + } else { + // skip over the rest of multibyte utf sequence + i += utf - 1; + } + } + + if (!is_tokenizable) { + // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1 + // it's faster to re-filter them here, since there are way less candidates now + + // Calculate a total "utf" length of a token string representation + size_t utf8_str_len = 0; + for (unsigned i = 0; i < token.length();) { + utf8_str_len++; + i += utf8_len(token.at(i)); + } + + // And skip the ones which are one character + if (utf8_str_len > 1) { + // At this point what we have left are special tokens only + vocab.special_tokens_cache[token] = id; + + // Count manually found special tokens + special_tokens_count_from_verification++; + + // If this manually found special token is not marked as such, flag a mismatch + if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) { + special_tokens_definition_mismatch = true; + } + } + } + } + } + + if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { + fprintf(stderr, "%s: warning: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size(), + special_tokens_count_by_type, vocab.id_to_token.size() + ); + } else { + fprintf(stderr, "%s: Special tokens definition check successful ( %u/%zu ).\n", + __func__, + special_tokens_count_from_verification, vocab.id_to_token.size() + ); + } + } } static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { @@ -6464,7 +6562,137 @@ private: llm_bigram_bpe::queue work_queue; }; -static std::vector 
llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { +typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{ + FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN, + FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT +} FRAGMENT_BUFFER_VARIANT_TYPE; + +struct fragment_buffer_variant{ + fragment_buffer_variant(llama_vocab::id _token) + : + type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN), + token(_token), + raw_text(_dummy), + offset(0), + length(0){} + fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length) + : + type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT), + token((llama_vocab::id)-1), + raw_text(_raw_text), + offset(_offset), + length(_length){ + GGML_ASSERT( _offset >= 0 ); + GGML_ASSERT( _length >= 1 ); + GGML_ASSERT( offset + length <= raw_text.length() ); + } + + const FRAGMENT_BUFFER_VARIANT_TYPE type; + const llama_vocab::id token; + const std::string _dummy; + const std::string & raw_text; + const uint64_t offset; + const uint64_t length; +}; + +// #define PRETOKENIZERDEBUG + +static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) +{ + // for each special token + for (const auto & st: vocab.special_tokens_cache) { + const auto & special_token = st.first; + const auto & special_id = st.second; + + // for each text fragment + std::forward_list::iterator it = buffer.begin(); + while (it != buffer.end()) { + auto & fragment = (*it); + + // if a fragment is text ( not yet processed ) + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { + auto * raw_text = &(fragment.raw_text); + + auto raw_text_base_offset = fragment.offset; + auto raw_text_base_length = fragment.length; + + // loop over the text + while (true) { + // find the first occurence of a given special token in this fragment + // passing offset argument only limit the "search area" but match coordinates + // are still relative to the source full raw_text + auto match = raw_text->find(special_token, raw_text_base_offset); + + // no occurences found, stop processing this fragment for a given special token + if (match == std::string::npos) break; + + // check if match is within bounds of offset <-> length + if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break; + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); +#endif + auto source = std::distance(buffer.begin(), it); + + // if match is further than base offset + // then we have some text to the left of it + if (match > raw_text_base_offset) { + // left + const int64_t left_reminder_offset = raw_text_base_offset + 0; + const int64_t left_reminder_length = match - raw_text_base_offset; + buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); +#endif + it++; + } + + // special token + buffer.emplace_after(it, special_id); + it++; + + // right + if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { + const int64_t right_reminder_offset = match + special_token.length(); + const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); + buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length); + +#ifdef 
PRETOKENIZERDEBUG + fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); +#endif + + it++; + + if (source == 0) { + buffer.erase_after(buffer.before_begin()); + } else { + buffer.erase_after(std::next(buffer.begin(), (source-1))); + } + + // repeat for the right side + raw_text_base_offset = right_reminder_offset; + raw_text_base_length = right_reminder_length; + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); +#endif + } else { + if (source == 0) { + buffer.erase_after(buffer.before_begin()); + } else { + buffer.erase_after(std::next(buffer.begin(), (source-1))); + } + break; + } + } + } + it++; + } + } +} + +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) { std::vector output; // OG tokenizer behavior: @@ -6480,20 +6708,58 @@ static std::vector llama_tokenize_internal(const llama_vocab & return output; } + std::forward_list fragment_buffer; + fragment_buffer.emplace_front( raw_text, 0, raw_text.length() ); + + if (special) tokenizer_st_partition( vocab, fragment_buffer ); + switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { - // without adding this leading whitespace, we do not get the same results as the original tokenizer - raw_text = " " + raw_text; + for (const auto & fragment: fragment_buffer) + { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) + { + // without adding this leading whitespace, we do not get the same results as the original tokenizer - llm_tokenizer_spm tokenizer(vocab); - llama_escape_whitespace(raw_text); - tokenizer.tokenize(raw_text, output); + // TODO: It's likely possible to get rid of this string copy entirely + // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer + // and passing 'add space prefix' as bool argument + // + auto raw_text = (special ? 
"" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_spm tokenizer(vocab); + llama_escape_whitespace(raw_text); + tokenizer.tokenize(raw_text, output); + } + else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + { + output.push_back(fragment.token); + } + } } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe tokenizer(vocab); - tokenizer.tokenize(raw_text, output); + for (const auto & fragment: fragment_buffer) + { + if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) + { + auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); + +#ifdef PRETOKENIZERDEBUG + fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); +#endif + llm_tokenizer_bpe tokenizer(vocab); + tokenizer.tokenize(raw_text, output); + } + else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + { + output.push_back(fragment.token); + } + } } break; } @@ -9407,15 +9673,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) { return ctx->model.vocab.special_eot_id; } - int llama_tokenize( const struct llama_model * model, const char * text, int text_len, llama_token * tokens, int n_max_tokens, - bool add_bos) { - auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos); + bool add_bos, + bool special) { + auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); if (n_max_tokens < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); diff --git a/llama.h b/llama.h index a78015ada..b13f23123 100644 --- a/llama.h +++ b/llama.h @@ -511,17 +511,20 @@ extern "C" { // Tokenization // - // Convert the provided text into tokens. - // The tokens pointer must be large enough to hold the resulting tokens. - // Returns the number of tokens on success, no more than n_max_tokens - // Returns a negative number on failure - the number of tokens that would have been returned + /// @details Convert the provided text into tokens. + /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. + /// @return Returns the number of tokens on success, no more than n_max_tokens + /// @return Returns a negative number on failure - the number of tokens that would have been returned + /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. + /// Does not insert a leading space. LLAMA_API int llama_tokenize( const struct llama_model * model, const char * text, int text_len, llama_token * tokens, int n_max_tokens, - bool add_bos); + bool add_bos, + bool special); // Token Id -> Piece. // Uses the vocabulary in the provided context. 
From 5fe268a4d9ce09f3a6c77239af583d3a8e49d54c Mon Sep 17 00:00:00 2001 From: ldwang Date: Tue, 17 Oct 2023 23:52:33 +0800 Subject: [PATCH 06/28] readme : add Aquila2 links (#3610) Signed-off-by: ldwang Co-authored-by: ldwang --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 60f14a1fb..56372865b 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ as the main playground for developing new features for the [ggml](https://github - [X] [WizardLM](https://github.com/nlpxucan/WizardLM) - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) +- [X] [Aquila2-7B](https://huggingface.co/BAAI/Aquila2-7B) / [AquilaChat2-7B](https://huggingface.co/BAAI/AquilaChat2-7B) / [AquilaChat2-34B](https://huggingface.co/BAAI/AquilaChat2-34B) / [Aquila2-34B](https://huggingface.co/BAAI/Aquila2-34B) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) From 1142013da40e98946a109b141dd858f0ed996051 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Oct 2023 19:12:46 +0300 Subject: [PATCH 07/28] save-load-state : fix example + add ci test (#3655) * save-load-state : fix example (close #3606) * ci : add test for save-load-state example ggml-ci --- ci/run.sh | 6 ++ examples/save-load-state/save-load-state.cpp | 97 ++++++++++---------- 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 34c9129c1..2e3343831 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -208,6 +208,8 @@ function gg_run_open_llama_3b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + function check_ppl { qnt="$1" ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) @@ -296,6 +298,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" @@ -382,6 +385,8 @@ function gg_run_open_llama_7b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + function check_ppl { qnt="$1" ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) @@ -470,6 +475,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- 
q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index f9e3c98a3..38d05f4d3 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -8,10 +8,7 @@ int main(int argc, char ** argv) { gpt_params params; - llama_sampling_params & sparams = params.sampling_params; - params.seed = 42; - params.n_threads = 4; - sparams.repeat_last_n = 64; + params.prompt = "The quick brown fox"; if (!gpt_params_parse(argc, argv, params)) { @@ -25,56 +22,49 @@ int main(int argc, char ** argv) { } auto n_past = 0; - auto last_n_tokens_data = std::vector(sparams.repeat_last_n, 0); + + std::string result0; + std::string result1; // init llama_model * model; llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params( params ); - if (model == nullptr) { - return 1; - } - if (ctx == nullptr) { - llama_free_model(model); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); return 1; } + + // tokenize prompt auto tokens = llama_tokenize(ctx, params.prompt, true); - auto n_prompt_tokens = tokens.size(); - if (n_prompt_tokens < 1) { - fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); - llama_free(ctx); - llama_free_model(model); - return 1; - } // evaluate prompt - llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0)); + llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); + n_past += tokens.size(); - last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens); - n_past += n_prompt_tokens; - - const size_t state_size = llama_get_state_size(ctx); - uint8_t * state_mem = new uint8_t[state_size]; - - // Save state (rng, logits, embedding and kv_cache) to file + // save state (rng, logits, embedding and kv_cache) to file { - FILE *fp_write = fopen("dump_state.bin", "wb"); - llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file - fwrite(state_mem, 1, state_size, fp_write); - fclose(fp_write); + std::vector state_mem(llama_get_state_size(ctx)); + + { + FILE *fp_write = fopen("dump_state.bin", "wb"); + llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file + fwrite(state_mem.data(), 1, state_mem.size(), fp_write); + fclose(fp_write); + } } // save state (last tokens) - const auto last_n_tokens_data_saved = std::vector(last_n_tokens_data); const auto n_past_saved = n_past; // first run - printf("\n%s", params.prompt.c_str()); + printf("\nfirst run: %s", params.prompt.c_str()); for (auto i = 0; i < params.n_predict; i++) { auto * logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(model); + std::vector candidates; candidates.reserve(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -83,9 +73,10 @@ int main(int argc, char ** 
argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); auto next_token_str = llama_token_to_piece(ctx, next_token); - last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); + result0 += next_token_str; + if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx); @@ -103,32 +94,28 @@ int main(int argc, char ** argv) { // make new context auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); - // Load state (rng, logits, embedding and kv_cache) from file - { - FILE *fp_read = fopen("dump_state.bin", "rb"); - if (state_size != llama_get_state_size(ctx2)) { - fprintf(stderr, "\n%s : failed to validate state size\n", __func__); - llama_free(ctx2); - llama_free_model(model); - return 1; - } + printf("\nsecond run: %s", params.prompt.c_str()); - const size_t ret = fread(state_mem, 1, state_size, fp_read); - if (ret != state_size) { + // load state (rng, logits, embedding and kv_cache) from file + { + std::vector state_mem(llama_get_state_size(ctx2)); + + FILE * fp_read = fopen("dump_state.bin", "rb"); + + const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read); + if (ret != state_mem.size()) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx2); llama_free_model(model); return 1; } - llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file + llama_set_state_data(ctx2, state_mem.data()); + fclose(fp_read); } - delete[] state_mem; - // restore state (last tokens) - last_n_tokens_data = last_n_tokens_data_saved; n_past = n_past_saved; // second run @@ -143,10 +130,11 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); auto next_token_str = llama_token_to_piece(ctx2, next_token); - last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); - if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { + result1 += next_token_str; + + if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx2); llama_free_model(model); @@ -155,10 +143,17 @@ int main(int argc, char ** argv) { n_past += 1; } - printf("\n\n"); + printf("\n"); llama_free(ctx2); llama_free_model(model); + if (result0 != result1) { + fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); + return 1; + } + + fprintf(stderr, "\n%s : success\n", __func__); + return 0; } From 3ad1e3f1a10c1f66b4f1cd7510e0977fadbc0dfd Mon Sep 17 00:00:00 2001 From: coezbek Date: Tue, 17 Oct 2023 18:51:02 +0200 Subject: [PATCH 08/28] server : documentation of JSON return value of /completion endpoint (#3632) * Added documentation of JSON return value of /completion endpoint * Update examples/server/README.md --------- Co-authored-by: Georgi Gerganov --- examples/server/README.md | 42 +++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 8a079ae26..9737010d3 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -106,25 +106,25 @@ node index.js ## API Endpoints -- **POST** `/completion`: Given a prompt, it returns the predicted completion. 
+- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* + `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does. + `temperature`: Adjust the randomness of the generated text (default: 0.8). `top_k`: Limit the next token selection to the K most probable tokens (default: 40). `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95). - `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity). + `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity). - `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. - By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. + By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. - `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does. - `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). @@ -158,6 +158,36 @@ node index.js `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) + *Result JSON:* + + Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. + + `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. 
+ + `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) + + `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model` + + `model`: The path to the model loaded with `-m` + + `prompt`: The provided `prompt` + + `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token + + `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered + + `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided + + `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) + + `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` + + `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`) + + `tokens_evaluated`: Number of tokens evaluated in total from the prompt + + `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) + - **POST** `/tokenize`: Tokenize a given text. *Options:* From e74c705e15cd228ad696c4a3cdea6d6fb4ff434c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Oct 2023 19:52:53 +0300 Subject: [PATCH 09/28] editorconfig : remove trailing spaces --- examples/server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 9737010d3..9f0ace3d7 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -164,7 +164,7 @@ node index.js `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) + `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model` @@ -186,7 +186,7 @@ node index.js `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) + `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) - **POST** `/tokenize`: Tokenize a given text. 
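
To make the field list above easier to scan, here is a hedged sketch of what a complete (non-streaming) result object might look like, built with the `nlohmann::json` header bundled with the server example; every value is invented for illustration and the exact field set may vary by version.

```cpp
// Illustrative only: approximate shape of a non-streaming /completion result,
// using the fields documented above. All values are made up for the example.
#include "json.hpp" // bundled with examples/server

using json = nlohmann::json;

static json example_completion_result() {
    // a real response also carries "generation_settings" (omitted here for brevity)
    return json{
        {"content",          " Paris."},
        {"stop",             true},
        {"model",            "models/7B/ggml-model-q4_0.gguf"},
        {"prompt",           "Question: What is the capital of France? Answer:"},
        {"stopped_eos",      true},
        {"stopped_limit",    false},
        {"stopped_word",     false},
        {"stopping_word",    ""},
        {"truncated",        false},
        {"tokens_cached",    12},
        {"tokens_evaluated", 12},
        {"tokens_predicted", 3},
        {"timings",          {{"predicted_per_second", 35.2}}}
    };
}
```
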
From a5e8c1d8c71f01d98ae2ec63a57c118664f9764d Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 17 Oct 2023 19:00:58 +0200 Subject: [PATCH 10/28] train-text-from-scratch : fix assert failure in ggml-alloc (#3618) --- .../train-text-from-scratch.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index be693b3ac..1ce6cef29 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -253,13 +253,14 @@ static void init_model(struct my_llama_model * model) { set_param_model(model); // measure data size - struct ggml_allocr * alloc = NULL; - alloc = ggml_allocr_new_measure(tensor_alignment); - alloc_model(alloc, model); + size_t size = 0; + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + size += GGML_PAD(ggml_nbytes(t), tensor_alignment); + } // allocate data - model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); - ggml_allocr_free(alloc); + struct ggml_allocr * alloc = NULL; + model->data.resize(size + tensor_alignment); alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment); alloc_model(alloc, model); ggml_allocr_free(alloc); @@ -1094,11 +1095,9 @@ int main(int argc, char ** argv) { struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); // measure required memory for input tensors - alloc = ggml_allocr_new_measure(tensor_alignment); - ggml_allocr_alloc(alloc, tokens_input); - ggml_allocr_alloc(alloc, target_probs); - size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; - ggml_allocr_free(alloc); + size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) + + GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) + + tensor_alignment; printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); // allocate input tensors From 40e5ce054f4c4fa555e4510ea5f760bb29185332 Mon Sep 17 00:00:00 2001 From: shibe2 Date: Wed, 11 Oct 2023 21:30:06 +0400 Subject: [PATCH 11/28] CLBlast: Fix temporary buffer size for f16 conversion (wsize) Fix buffer overflow. Reduce the size to fit just one 2D slice. Assert sufficient size. 
--- ggml-opencl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 33d0691eb..22fd0e3a7 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1568,7 +1568,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr ggml_cl_pool_free(d_D, d_size); } -static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { +static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { GGML_ASSERT(fp16_support); const int64_t ne00 = src0->ne[0]; @@ -1598,6 +1598,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; + GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne); + GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne); + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata; + size_t x_size; size_t y_size; size_t d_size; @@ -1634,7 +1638,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr // convert src1 to fp16 // TODO: use multiple threads - ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12); char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; if (src1_cont_rows) { if (src1_cont_cols) { @@ -1897,8 +1900,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * } size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - return ggml_nelements(src1) * sizeof(ggml_fp16_t); + if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]); } return 0; } From 8402566a7c436bfbde8e7b0461faee50298106a0 Mon Sep 17 00:00:00 2001 From: BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> Date: Tue, 17 Oct 2023 14:13:21 -0400 Subject: [PATCH 12/28] readme : update hot-topics & models, detail windows release in usage (#3615) * Update README.md * Update README.md * Update README.md * move "Running on Windows" section below "Prepare data and run" --------- Co-authored-by: Georgi Gerganov --- README.md | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 56372865b..4fd4bd427 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics - +- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252) - ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401) - Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ **Devs should become familiar with the new API** @@ -89,16 +89,17 @@ as the main playground for developing new features for the [ggml](https://github - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) -- [X] [Pygmalion 7B / Metharme 
7B](#using-pygmalion-7b--metharme-7b) +- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b) - [X] [WizardLM](https://github.com/nlpxucan/WizardLM) -- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) -- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) -- [X] [Aquila2-7B](https://huggingface.co/BAAI/Aquila2-7B) / [AquilaChat2-7B](https://huggingface.co/BAAI/AquilaChat2-7B) / [AquilaChat2-34B](https://huggingface.co/BAAI/AquilaChat2-34B) / [Aquila2-34B](https://huggingface.co/BAAI/Aquila2-34B) +- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) +- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) -- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) +- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) + **Bindings:** @@ -207,7 +208,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the steps for the LLaMA-7B model. +Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model. ### Get the Code @@ -574,6 +575,18 @@ python3 convert.py models/7B/ When running the larger models, make sure you have enough disk space to store all the intermediate files. +### Running on Windows with prebuilt binaries + +You will find prebuilt Windows binaries on the release page. + +Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`) + +From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so: + +``` +.\main -m llama-2-7b.Q4_0.gguf -n 128 +``` + ### Memory/Disk Requirements As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. 
From e1675d133c31e1c8de2f06be7164e12c0ba6cf2c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Oct 2023 22:34:26 +0300 Subject: [PATCH 13/28] llama : avoid fprintf in favor of LLAMA_LOG (#3538) --- examples/main/main.cpp | 2 +- llama.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a5fb65548..7313d06a0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -799,7 +799,7 @@ int main(int argc, char ** argv) { } const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); diff --git a/llama.cpp b/llama.cpp index 82b7638ae..37df88779 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2327,13 +2327,13 @@ static void llm_load_vocab( } if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { - fprintf(stderr, "%s: warning: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", + LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", __func__, special_tokens_count_from_verification, vocab.id_to_token.size(), special_tokens_count_by_type, vocab.id_to_token.size() ); } else { - fprintf(stderr, "%s: Special tokens definition check successful ( %u/%zu ).\n", + LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n", __func__, special_tokens_count_from_verification, vocab.id_to_token.size() ); From cb33f43a2a9f5a5a5f8d290dd97c625d9ba97a2f Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 17 Oct 2023 22:24:50 +0200 Subject: [PATCH 14/28] fix embeddings when using CUDA (#3657) --- llama.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 37df88779..04a779e04 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5903,6 +5903,13 @@ static int llama_decode_internal( ggml_allocr_alloc_graph(lctx.alloc, gf); + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + + #ifdef GGML_USE_CUBLAS for (int i = 0; i < gf->n_leafs; i++) { ggml_tensor * node = gf->leafs[i]; @@ -5920,6 +5927,12 @@ static int llama_decode_internal( } ggml_cuda_set_mul_mat_q(cparams.mul_mat_q); + + // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed + if (!lctx.embedding.empty()) { + embeddings->backend = GGML_BACKEND_CPU; + } + res->backend = GGML_BACKEND_CPU; #endif // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -5944,12 +5957,6 @@ static int llama_decode_internal( n_threads = 1; } - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - #if GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); From 1117d06607d2d885640ac501f05f0aae5494e2c5 Mon Sep 17 00:00:00 2001 From: shibe2 Date: Wed, 18 Oct 2023 
16:09:22 +0400 Subject: [PATCH 15/28] opencl : fix element-wise multiplication (#3656) --- ggml-opencl.cpp | 67 ++++++++++++++----------------------------------- 1 file changed, 19 insertions(+), 48 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 22fd0e3a7..67ac20eac 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1395,75 +1395,46 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; - const int64_t ne0 = ne00 * ne01 * ne02 * ne03; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; const int64_t ne12 = src1->ne[2]; const int64_t ne13 = src1->ne[3]; - const int64_t nb10 = src1->nb[0]; const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; size_t x_size; size_t d_size; - cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0 + cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. - cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst + cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - const int i0 = i03*ne02 + i02; - cl_event ev; // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev)); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); - if (nb10 == sizeof(float)) { - // Contiguous, avoid overhead from queueing many kernel runs - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int i1 = i13*ne12*ne11 + i12*ne11; + const int64_t i13 = i03%ne13; + const int64_t i12 = i02%ne12; + const int i1 = i13*ne12*ne11 + i12*ne11; - cl_int x_offset = 0; - cl_int y_offset = i1*ne10; - cl_int d_offset = 0; + cl_int x_offset = 0; + cl_int y_offset = i1*ne10; + cl_int d_offset = 0; - size_t global = ne00 * ne01; - cl_int ky = ne10; - CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - } else { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int64_t i11 = i01%ne11; - const int i1 = i13*ne12*ne11 + i12*ne11 + i11; + size_t global = ne00 * ne01; + cl_int ky = ne10 * ne11; - cl_int x_offset = i01*ne00; - cl_int y_offset = i1*ne10; - cl_int d_offset = i01*ne00; - - // compute - size_t global = ne00; - cl_int ky = ne10; - CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, 
&global, NULL, 1, &ev, NULL)); - } - } + CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); + CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); + CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); CL_CHECK(clReleaseEvent(ev)); CL_CHECK(clFinish(queue)); From c67fe68e417f766970fb1feaf2e66458aa24116a Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Wed, 18 Oct 2023 07:21:48 -0500 Subject: [PATCH 16/28] metal : implement q5_0 and q5_1 kernels (#3648) * metal : implement dequantize_q5_0 * metal : block_q_n_dot_y for block_q5_0 (broken) * metal : revert unnecessary change * metal : implement dequantize_q5_1 * metal : block_q_n_dot_y for q5_1 (broken) * metal : fix block_q_n_dot_y * minor : spaces / formatting --------- Co-authored-by: Georgi Gerganov --- ggml-metal.m | 47 +++++++++++++- ggml-metal.metal | 163 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 206 insertions(+), 4 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 87fa17216..c908106be 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -73,6 +73,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); GGML_METAL_DECL_KERNEL(get_rows_q4_1); + GGML_METAL_DECL_KERNEL(get_rows_q5_0); + GGML_METAL_DECL_KERNEL(get_rows_q5_1); GGML_METAL_DECL_KERNEL(get_rows_q8_0); GGML_METAL_DECL_KERNEL(get_rows_q2_K); GGML_METAL_DECL_KERNEL(get_rows_q3_K); @@ -87,6 +89,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32); GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32); @@ -97,6 +101,8 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32); GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32); @@ -254,6 +260,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_1); + GGML_METAL_ADD_KERNEL(get_rows_q5_0); + GGML_METAL_ADD_KERNEL(get_rows_q5_1); GGML_METAL_ADD_KERNEL(get_rows_q8_0); GGML_METAL_ADD_KERNEL(get_rows_q2_K); GGML_METAL_ADD_KERNEL(get_rows_q3_K); @@ -268,6 +276,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32); GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32); @@ -278,8 +288,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); - 
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); @@ -346,6 +358,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(get_rows_f16); GGML_METAL_DEL_KERNEL(get_rows_q4_0); GGML_METAL_DEL_KERNEL(get_rows_q4_1); + GGML_METAL_DEL_KERNEL(get_rows_q5_0); + GGML_METAL_DEL_KERNEL(get_rows_q5_1); GGML_METAL_DEL_KERNEL(get_rows_q8_0); GGML_METAL_DEL_KERNEL(get_rows_q2_K); GGML_METAL_DEL_KERNEL(get_rows_q3_K); @@ -360,6 +374,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32); GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32); GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32); @@ -370,8 +386,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); @@ -1052,6 +1070,8 @@ void ggml_metal_graph_compute( case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; + case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break; + case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break; case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break; case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; @@ -1121,6 +1141,24 @@ void ggml_metal_graph_compute( nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; } break; + case GGML_TYPE_Q5_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32]; + } break; + case GGML_TYPE_Q5_1: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32]; + } break; case GGML_TYPE_Q8_0: { GGML_ASSERT(ne02 == 1); @@ -1201,7 +1239,8 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || + src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 
1)]; } @@ -1233,6 +1272,8 @@ void ggml_metal_graph_compute( case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; + case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break; + case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break; case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break; case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 99b9fd7a7..69fc71362 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -18,6 +18,21 @@ typedef struct { uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; +#define QK5_0 32 +typedef struct { + half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + +#define QK5_1 32 +typedef struct { + half d; // delta + half m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; + #define QK8_0 32 typedef struct { half d; // delta @@ -399,8 +414,11 @@ kernel void kernel_rms_norm( // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) { float d = qb_curr->d; + float2 acc = 0.f; + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2); + for (int i = 0; i < 8; i+=2) { acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + yl[i + 1] * (qs[i / 2] & 0x0F00); @@ -417,8 +435,11 @@ inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thre inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) { float d = qb_curr->d; float m = qb_curr->m; - device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2); + float2 acc = 0.f; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2); + for (int i = 0; i < 8; i+=2) { acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F) + yl[i + 1] * (qs[i / 2] & 0x0F00); @@ -428,6 +449,49 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre return d * (acc[0] + acc[1]) + sumy * m; } +// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q5 quants begin (0 or QK5_0/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + + float2 acc = 0.f; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 3 + il/2); + const uint32_t qh = *((device const uint32_t *)qb_curr->qh); + + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)) + + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)) + + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + } + return d * (sumy * 
-16.f + acc[0] + acc[1]); +} + +// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i]) +// il indicates where the q5 quants begin (0 or QK5_1/4) +// we assume that the yl's have been multiplied with the appropriate scale factor +// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096) +inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) { + float d = qb_curr->d; + float m = qb_curr->m; + + float2 acc = 0.f; + + device const uint16_t * qs = ((device const uint16_t *)qb_curr + 4 + il/2); + const uint32_t qh = *((device const uint32_t *)qb_curr->qh); + + for (int i = 0; i < 8; i+=2) { + acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il ) << 4 ) & 0x00010)) + + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il ) << 12) & 0x01000)); + acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100)) + + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000)); + } + return d * (acc[0] + acc[1]) + sumy * m; +} + // putting them in the kernel cause a significant performance penalty #define N_DST 4 // each SIMD group works on 4 rows #define N_SIMDGROUP 2 // number of SIMD groups in a thread group @@ -525,6 +589,43 @@ kernel void kernel_mul_mv_q4_1_f32( mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } +kernel void kernel_mul_mv_q5_0_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); +} + +kernel void kernel_mul_mv_q5_1_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); +} + + #define NB_Q8_0 8 kernel void kernel_mul_mv_q8_0_f32( @@ -2149,6 +2250,62 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg } } +template +void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 3); + const float d = xb->d; + const float md = -16.h * xb->d; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 
0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + md; + reg[i/2][2*(i%2)+1] = d * x1 + md; + } +} + +template +void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 4); + const float d = xb->d; + const float m = xb->m; + const ushort mask = il ? 0x00F0 : 0x000F; + + const uint32_t qh = *((device const uint32_t *)xb->qh); + + const int x_mv = il ? 4 : 0; + + const int gh_mv = il ? 12 : 0; + const int gh_bk = il ? 0 : 4; + + for (int i = 0; i < 8; i++) { + // extract the 5-th bits for x0 and x1 + const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10; + const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10; + + // combine the 4-bits from qs with the 5th bit + const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0); + const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1); + + reg[i/2][2*(i%2)+0] = d * x0 + m; + reg[i/2][2*(i%2)+1] = d * x1 + m; + } +} + template void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { device const int8_t * qs = ((device const int8_t *)xb->qs); @@ -2490,6 +2647,8 @@ template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows; @@ -2518,6 +2677,8 @@ template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; From 0e89203b517c95ec6675eda75d200a60d1e8921d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Oct 2023 16:21:57 +0300 Subject: [PATCH 17/28] speculative : add tree-based sampling example (#3624) * sampling : one sequence per sampling context ggml-ci * speculative : add tree-based sampling support ggml-ci * speculative : reuse the n_parallel CLI param * speculative : refactor sampling * examples : fix build after sampling refactoring ggml-ci * batched : fix n_seq_id * sampling : fix malloc ggml-ci * swift : fix build ggml-ci * swift : try to fix build ggml-ci * prompts : add assistant.txt * common : add 
llama_batch_add() and llama_batch_clear() helpers * speculative : minor refactor ggml-ci * minor : comments + rename ggml-ci * speculative : fix off-by-one for n_drafted * speculative : fix the n_drafted fix + p constants --- Makefile | 2 +- common/common.cpp | 21 ++ common/common.h | 16 +- common/log.h | 101 ++++-- common/sampling.cpp | 201 +++++++----- common/sampling.h | 83 +++-- examples/batched-bench/batched-bench.cpp | 38 +-- examples/batched.swift/Sources/main.swift | 14 +- examples/batched/batched.cpp | 26 +- examples/embd-input/embd-input-lib.cpp | 2 +- examples/infill/infill.cpp | 44 +-- examples/llava/llava-utils.h | 2 +- examples/llava/llava.cpp | 2 +- examples/main/main.cpp | 92 ++---- examples/parallel/parallel.cpp | 70 ++-- examples/server/server.cpp | 73 ++--- examples/simple/simple.cpp | 2 +- examples/speculative/speculative.cpp | 383 ++++++++++++++-------- llama.cpp | 95 ++++-- llama.h | 17 +- prompts/assistant.txt | 31 ++ 21 files changed, 737 insertions(+), 578 deletions(-) create mode 100644 prompts/assistant.txt diff --git a/Makefile b/Makefile index 9a8faef45..04104bee8 100644 --- a/Makefile +++ b/Makefile @@ -545,7 +545,7 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l $(CXX) $(CXXFLAGS) -c $< -o $@ COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h -COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o +COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/common/common.cpp b/common/common.cpp index 3e4b8a8cb..ce14d66b8 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -820,6 +820,27 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param return cparams; } +void llama_batch_clear(struct llama_batch & batch) { + batch.n_tokens = 0; +} + +void llama_batch_add( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector & seq_ids, + bool logits) { + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = pos, + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; + } + batch.logits [batch.n_tokens] = logits; + + batch.n_tokens++; +} + std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); diff --git a/common/common.h b/common/common.h index 08c603231..65d3d20cd 100644 --- a/common/common.h +++ b/common/common.h @@ -70,6 +70,7 @@ struct gpt_params { std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files + // TODO: avoid tuple, use struct std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter @@ -124,10 +125,23 @@ void process_escapes(std::string& input); // Model utils // +// TODO: avoid tuplue, use struct std::tuple llama_init_from_gpt_params(gpt_params & params); -struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params); + +struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +// Batch utils + +void llama_batch_clear(struct llama_batch & batch); + +void llama_batch_add( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const 
std::vector & seq_ids, + bool logits); + // // Vocab utils // diff --git a/common/log.h b/common/log.h index b8953fdca..70e7e4ca2 100644 --- a/common/log.h +++ b/common/log.h @@ -579,38 +579,75 @@ inline std::string log_var_to_string_impl(const std::vector & var) return buf.str(); } -#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \ - [&tokens, &ctx]() \ - { \ - std::stringstream buf; \ - buf << "[ "; \ - \ - bool first = true; \ - for (const auto &token : tokens) \ - { \ - if (!first) \ - buf << ", "; \ - else \ - first = false; \ - \ - auto detokenized = llama_token_to_piece(ctx, token); \ - \ - detokenized.erase( \ - std::remove_if( \ - detokenized.begin(), \ - detokenized.end(), \ - [](const unsigned char c) { return !std::isprint(c); }), \ - detokenized.end()); \ - \ - buf \ - << "'" << detokenized << "'" \ - << ":" << std::to_string(token); \ - } \ - buf << " ]"; \ - \ - return buf.str(); \ - }() \ - .c_str() +template +inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens) +{ + std::stringstream buf; + buf << "[ "; + + bool first = true; + for (const auto &token : tokens) + { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, token); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf + << "'" << detokenized << "'" + << ":" << std::to_string(token); + } + buf << " ]"; + + return buf.str(); +} + +template +inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch) +{ + std::stringstream buf; + buf << "[ "; + + bool first = true; + for (int i = 0; i < batch.n_tokens; ++i) + { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, batch.token[i]); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf + << "\n" << std::to_string(i) + << ":token '" << detokenized << "'" + << ":pos " << std::to_string(batch.pos[i]) + << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ":seq_id " << std::to_string(batch.seq_id[i][0]) + << ":logits " << std::to_string(batch.logits[i]); + } + buf << " ]"; + + return buf.str(); +} #ifdef LOG_DISABLE_LOGS diff --git a/common/sampling.cpp b/common/sampling.cpp index 8ce419459..0b2466581 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,64 +1,81 @@ #include "sampling.h" -llama_sampling_context::~llama_sampling_context() { - for (auto & it : sequence_contexts) { - if (it.second.grammar != NULL) { - llama_grammar_free(it.second.grammar); - it.second.grammar = NULL; +struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) { + struct llama_sampling_context * result = new llama_sampling_context(); + + result->params = params.sampling_params; + result->grammar = nullptr; + + // if there is a grammar, parse it + if (!params.grammar.empty()) { + result->parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + + // will be empty (default) if there are parse errors + if (result->parsed_grammar.rules.empty()) { + fprintf(stderr, "%s: failed to parse grammar\n", __func__); + return nullptr; } + + std::vector grammar_rules(result->parsed_grammar.c_rules()); + + result->grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); } + + 
result->prev.resize(params.n_ctx); + + return result; } -llama_sampling_context llama_sampling_context_init( - const struct gpt_params & params, - llama_grammar * grammar) { - llama_sampling_context result; +void llama_sampling_free(struct llama_sampling_context * ctx) { + if (ctx->grammar != NULL) { + llama_grammar_free(ctx->grammar); + } - result.params = params.sampling_params; - result.grammar = grammar; - return result; + delete ctx; } -// Note: Creates the context if it doesn't exist, so this always return something. -llama_sampler_sequence_context & llama_sampling_get_sequence_context( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq) { - const auto it = ctx_sampling.sequence_contexts.find(seq); - if (it != ctx_sampling.sequence_contexts.end()) { - return it->second; +void llama_sampling_reset(llama_sampling_context * ctx) { + if (ctx->grammar != NULL) { + llama_grammar_free(ctx->grammar); } - llama_sampler_sequence_context new_ctx = { - 2.0f * ctx_sampling.params.mirostat_tau, - ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL, - }; - return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second; + + if (!ctx->parsed_grammar.rules.empty()) { + std::vector grammar_rules(ctx->parsed_grammar.c_rules()); + + ctx->grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root")); + } + + std::fill(ctx->prev.begin(), ctx->prev.end(), 0); + ctx->cur.clear(); } -bool llama_sampling_context_reset( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq) { - const auto it = ctx_sampling.sequence_contexts.find(seq); - if (it == ctx_sampling.sequence_contexts.end()) return false; - if (it->second.grammar != NULL) { - llama_grammar_free(it->second.grammar); - it->second.grammar = NULL; +void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { + if (dst->grammar) { + llama_grammar_free(dst->grammar); + dst->grammar = nullptr; } - ctx_sampling.sequence_contexts.erase(it); - return true; + + if (src->grammar) { + dst->grammar = llama_grammar_copy(src->grammar); + } + + dst->prev = src->prev; } llama_token llama_sampling_sample( - struct llama_context * ctx, - struct llama_context * ctx_guidance, - struct llama_sampling_context & ctx_sampling, - const std::vector & last_tokens, - std::vector & candidates, - const int idx, - llama_seq_id seq) { - const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx) { + const int n_ctx = llama_n_ctx(ctx_main); + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + + const llama_sampling_params & params = ctx_sampling->params; - const llama_sampling_params & params = ctx_sampling.params; const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? 
n_vocab : params.top_k; const float top_p = params.top_p; @@ -73,41 +90,45 @@ llama_token llama_sampling_sample( const float mirostat_eta = params.mirostat_eta; const bool penalize_nl = params.penalize_nl; + auto & prev = ctx_sampling->prev; + auto & cur = ctx_sampling->cur; + llama_token id = 0; - float * logits = llama_get_logits_ith(ctx, idx); + float * logits = llama_get_logits_ith(ctx_main, idx); // Apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { logits[it->first] += it->second; } - candidates.clear(); + cur.clear(); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); } - llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - if (ctx_guidance) { - llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale); + if (ctx_cfg) { + llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale); } // apply penalties - if (!last_tokens.empty()) { - const float nl_logit = logits[llama_token_nl(ctx)]; - const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx); + if (!prev.empty()) { + const float nl_logit = logits[llama_token_nl(ctx_main)]; + const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty(ctx, &cur_p, - last_tokens.data() + last_tokens.size() - last_n_repeat, + llama_sample_repetition_penalty(ctx_main, &cur_p, + prev.data() + prev.size() - last_n_repeat, last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &cur_p, - last_tokens.data() + last_tokens.size() - last_n_repeat, + llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p, + prev.data() + prev.size() - last_n_repeat, last_n_repeat, alpha_frequency, alpha_presence); if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(ctx)) { + if (cur_p.data[idx].id == llama_token_nl(ctx_main)) { cur_p.data[idx].logit = nl_logit; break; } @@ -115,52 +136,58 @@ llama_token llama_sampling_sample( } } - llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq); - - if (ctx_seq.grammar != NULL) { - llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar); + if (ctx_sampling->grammar != NULL) { + llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } if (temp <= 0) { // Greedy sampling - id = llama_sample_token_greedy(ctx, &cur_p); + id = llama_sample_token_greedy(ctx_main, &cur_p); } else { if (mirostat == 1) { const int mirostat_m = 100; - llama_sample_temp(ctx, &cur_p, temp); - id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu); + llama_sample_temp(ctx_main, &cur_p, temp); + id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu); } else if (mirostat == 2) { - llama_sample_temp(ctx, &cur_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu); + llama_sample_temp(ctx_main, &cur_p, temp); + id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { // Temperature sampling size_t min_keep = std::max(1, params.n_probs); - 
llama_sample_top_k (ctx, &cur_p, top_k, min_keep); - llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep); - llama_sample_typical (ctx, &cur_p, typical_p, min_keep); - llama_sample_top_p (ctx, &cur_p, top_p, min_keep); - llama_sample_temp(ctx, &cur_p, temp); + llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); + llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); + llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); + llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); + llama_sample_temp (ctx_main, &cur_p, temp); - { - const int n_top = 10; - LOG("top %d candidates:\n", n_top); + id = llama_sample_token(ctx_main, &cur_p); - for (int i = 0; i < n_top; i++) { - const llama_token id = cur_p.data[i].id; - (void)id; // To avoid a warning that id is unused when logging is disabled. - LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p); - } - } + //{ + // const int n_top = 10; + // LOG("top %d candidates:\n", n_top); - id = llama_sample_token(ctx, &cur_p); + // for (int i = 0; i < n_top; i++) { + // const llama_token id = cur_p.data[i].id; + // (void)id; // To avoid a warning that id is unused when logging is disabled. + // LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p); + // } + //} - LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str()); + LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); } } - if (ctx_seq.grammar != NULL) { - llama_grammar_accept_token(ctx, ctx_seq.grammar, id); - } - return id; } + +void llama_sampling_accept( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + llama_token id) { + ctx_sampling->prev.erase(ctx_sampling->prev.begin()); + ctx_sampling->prev.push_back(id); + + if (ctx_sampling->grammar != NULL) { + llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); + } +} diff --git a/common/sampling.h b/common/sampling.h index 0aab5d03c..50afcbc12 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -2,6 +2,8 @@ #include "llama.h" +#include "grammar-parser.h" + #include #include #include @@ -34,75 +36,64 @@ typedef struct llama_sampling_params { } llama_sampling_params; -// per-sequence sampler context -typedef struct llama_sampler_sequence_context { - float mirostat_mu; // mirostat sampler state - llama_grammar * grammar; -} llama_sampler_sequence_context; - // general sampler context -typedef struct llama_sampling_context { - ~llama_sampling_context(); - - // parameters that will be used for sampling and when creating - // new llama_sampler_sequence_context instances +// TODO: move to llama.h +struct llama_sampling_context { + // parameters that will be used for sampling llama_sampling_params params; - // map of sequence ids to sampler contexts - std::unordered_map sequence_contexts; + // mirostat sampler state + float mirostat_mu; - // when non-NULL, new instances of llama_sampler_sequence_context - // will get a copy of the grammar here - // note: only the pointer is stored here, it is not a copy of - // the grammar and shouldn't be freed llama_grammar * grammar; -} llama_sampling_context; + + // internal + grammar_parser::parse_state parsed_grammar; + + // TODO: replace with ring-buffer + std::vector prev; + std::vector cur; +}; #include "common.h" // Create a new sampling context instance. 
-llama_sampling_context llama_sampling_context_init( - const struct gpt_params & params, - llama_grammar * grammar = NULL); +struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params); -// Fetches the sampler context for the specified sequence id (defaults to 0). -// If the context for that sequence id doesn't already exist, it will be created with -// default values based on the parameters in the ctx_sampling argument. -llama_sampler_sequence_context & llama_sampling_get_sequence_context( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq = 0); +void llama_sampling_free(struct llama_sampling_context * ctx); -// Reset the sampler context for the supplied sequence id (defaults to 0). -// This is necessary to reuse a sequence id or free memory used by sequences -// that are no longer required. -bool llama_sampling_context_reset( - llama_sampling_context & ctx_sampling, - const llama_seq_id seq = 0); +// Reset the sampler context +// - clear prev tokens +// - reset grammar +void llama_sampling_reset(llama_sampling_context * ctx); + +// Copy the sampler context +void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call -// llama_sampling_context_reset when a sequence ends +// llama_sampling_reset when a sequence ends // // required: -// - ctx: context to use for sampling +// - ctx_main: context to use for sampling // - ctx_sampling: sampling-specific context // // optional: -// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL -// - last_tokens: needed for repetition penalty, ignore if empty -// - idx: sample from llama_get_logits_ith(ctx, idx) -// - seq: sequence id to associate sampler state with +// - ctx_cfg: context to use for classifier-free guidance +// - idx: sample from llama_get_logits_ith(ctx, idx) // // returns: // - token: sampled token // - candidates: vector of candidate tokens // llama_token llama_sampling_sample( - struct llama_context * ctx, - struct llama_context * ctx_guidance, - struct llama_sampling_context & ctx_sampling, - const std::vector & last_tokens, - std::vector & candidates, - const int idx = 0, - llama_seq_id seq = 0); + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + int idx = 0); + +void llama_sampling_accept( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + llama_token id); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 3e1e0716d..c552eaa73 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { return 1; } - llama_batch batch = llama_batch_init(n_kv_max, 0); + llama_batch batch = llama_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { @@ -123,11 +123,12 @@ int main(int argc, char ** argv) { llama_batch batch_view = { n_tokens, - batch.token + i, + batch.token + i, nullptr, - batch.pos + i, - batch.seq_id + i, - batch.logits + i, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, 0, 0, 0, // unused }; @@ -143,13 +144,8 @@ int main(int 
argc, char ** argv) { // warm up { - batch.n_tokens = 16; - - for (int i = 0; i < batch.n_tokens; ++i) { - batch.token[i] = 0; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (int i = 0; i < 16; ++i) { + llama_batch_add(batch, 0, i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { @@ -174,13 +170,12 @@ int main(int argc, char ** argv) { continue; } - batch.n_tokens = is_pp_shared ? pp : pl*pp; + llama_batch_clear(batch); - for (int i = 0; i < batch.n_tokens; ++i) { - batch.token[i] = 0; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + const int n_tokens = is_pp_shared ? pp : pl*pp; + + for (int i = 0; i < n_tokens; ++i) { + llama_batch_add(batch, 0, i, { 0 }, false); } batch.logits[batch.n_tokens - 1] = true; @@ -204,13 +199,10 @@ int main(int argc, char ** argv) { const auto t_tg_start = ggml_time_us(); for (int i = 0; i < tg; ++i) { - batch.n_tokens = pl; + llama_batch_clear(batch); for (int j = 0; j < pl; ++j) { - batch.token[j] = 0; - batch.pos[j] = pp + i; - batch.seq_id[j] = j; - batch.logits[j] = true; + llama_batch_add(batch, 0, pp + i, { j }, true); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 05d1bb9d0..772730382 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -69,7 +69,7 @@ for id: llama_token in tokens { print("\n") -var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0) +var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1) defer { llama_batch_free(batch) } @@ -80,7 +80,12 @@ batch.n_tokens = Int32(tokens.count) for (i, token) in tokens.enumerated() { batch.token[i] = token batch.pos[i] = Int32(i) - batch.seq_id[i] = 0 + batch.n_seq_id[i] = 1 + // batch.seq_id[i][0] = 0 + // TODO: is this the proper way to do this? 
+ if let seq_id = batch.seq_id[i] { + seq_id[0] = 0 + } batch.logits[i] = 0 } @@ -169,7 +174,10 @@ while n_cur <= n_len { // push this new token for next evaluation batch.token[Int(batch.n_tokens)] = new_token_id batch.pos[Int(batch.n_tokens)] = n_cur - batch.seq_id[Int(batch.n_tokens)] = Int32(i) + batch.n_seq_id[Int(batch.n_tokens)] = 1 + if let seq_id = batch.seq_id[Int(batch.n_tokens)] { + seq_id[0] = Int32(i) + } batch.logits[Int(batch.n_tokens)] = 1 i_batch[i] = batch.n_tokens diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index a88e022d6..155212165 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -97,20 +97,15 @@ int main(int argc, char ** argv) { fflush(stderr); - // create a llama_batch with size 512 + // create a llama_batch // we use this object to submit token data for decoding - - llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0); + llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1); // evaluate the initial prompt - batch.n_tokens = tokens_list.size(); - - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = tokens_list[i]; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (size_t i = 0; i < tokens_list.size(); ++i) { + llama_batch_add(batch, tokens_list[i], i, { 0 }, false); } + GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); // llama_decode will output logits only for the last token of the prompt batch.logits[batch.n_tokens - 1] = true; @@ -146,7 +141,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_len) { // prepare the next batch - batch.n_tokens = 0; + llama_batch_clear(batch); // sample the next token for each parallel sequence / stream for (int32_t i = 0; i < n_parallel; ++i) { @@ -198,15 +193,10 @@ int main(int argc, char ** argv) { streams[i] += llama_token_to_piece(ctx, new_token_id); - // push this new token for next evaluation - batch.token [batch.n_tokens] = new_token_id; - batch.pos [batch.n_tokens] = n_cur; - batch.seq_id[batch.n_tokens] = i; - batch.logits[batch.n_tokens] = true; - i_batch[i] = batch.n_tokens; - batch.n_tokens += 1; + // push this new token for next evaluation + llama_batch_add(batch, new_token_id, n_cur, { i }, true); n_decode += 1; } diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 87a5a1c26..3ce33842c 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -79,7 +79,7 @@ bool eval_float(void * model, float * input, int N){ if (n_eval > n_batch) { n_eval = n_batch; } - llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; if (llama_decode(ctx, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 187623f5d..128d67080 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -257,12 +257,12 @@ int main(int argc, char ** argv) { LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(ctx)); - 
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } // Tokenize negative prompt @@ -273,10 +273,10 @@ int main(int argc, char ** argv) { LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; @@ -294,8 +294,8 @@ int main(int argc, char ** argv) { params.n_keep = (int)embd_inp.size(); } - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); // enable interactive mode if interactive start is specified @@ -388,9 +388,6 @@ int main(int argc, char ** argv) { grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } - // TODO: replace with ring-buffer - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); LOG_TEE("\n##### Infill mode #####\n\n"); if (params.infill) { printf("\n************\n"); @@ -433,11 +430,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - const int n_vocab = llama_n_vocab(model); - - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar); - std::vector candidates; - candidates.reserve(n_vocab); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); while (n_remain != 0 || params.interactive) { // predict @@ -484,7 +477,7 @@ int main(int argc, char ** argv) { LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); } @@ -512,7 +505,7 @@ int main(int argc, char ** argv) { input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); } else { input_buf = embd.data(); input_size = embd.size(); @@ -535,7 +528,7 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { LOG_TEE("%s : failed to eval\n", __func__); @@ -554,12 +547,11 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + 
llama_sampling_accept(ctx_sampling, ctx, id); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -575,8 +567,8 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(embd_inp[n_consumed]); + ctx_sampling->prev.erase(ctx_sampling->prev.begin()); + ctx_sampling->prev.push_back(embd_inp[n_consumed]); ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -608,7 +600,7 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){ + if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){ if(is_interacting && !params.interactive_first) { // print an eot token printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); @@ -675,7 +667,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of text token in interactive mode - else if (last_tokens.back() == llama_token_eos(ctx)) { + else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { @@ -727,7 +719,7 @@ int main(int argc, char ** argv) { const size_t original_size = embd_inp.size(); const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h index 4e71351dd..e050b59be 100644 --- a/examples/llava/llava-utils.h +++ b/examples/llava/llava-utils.h @@ -17,7 +17,7 @@ inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int if (n_eval > n_batch) { n_eval = n_batch; } - llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, *n_past, 1, 0, }; + llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; if (llama_decode(ctx_llama, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index b24cb2e6f..f0974d5bc 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -127,7 +127,7 @@ int main(int argc, char ** argv) { const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true); + eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true); eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past); eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7313d06a0..1a5911c56 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -3,7 +3,6 @@ #include "console.h" #include "llama.h" #include "build-info.h" -#include "grammar-parser.h" #include #include @@ -245,12 +244,12 @@ int main(int argc, char ** argv) { } LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(ctx)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } // Tokenize negative prompt @@ -261,10 +260,10 @@ int main(int argc, char ** argv) { LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; @@ -323,8 +322,8 @@ int main(int argc, char ** argv) { const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { @@ -421,35 +420,6 @@ int main(int argc, char ** argv) { LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); - struct llama_grammar * grammar = NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); - grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = sparams.logit_bias.find(llama_token_eos(ctx)); - if (it != sparams.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); - } - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), 
parsed_grammar.symbol_ids.at("root")); - } - - // TODO: replace with ring-buffer - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); - if (params.interactive) { const char *control_message; if (params.multiline_input) { @@ -489,11 +459,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - const int n_vocab = llama_n_vocab(model); - - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar); - std::vector candidates; - candidates.reserve(n_vocab); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -540,7 +506,7 @@ int main(int argc, char ** argv) { LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG("clear session path\n"); path_session.clear(); @@ -570,7 +536,6 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { int input_size = 0; llama_token * input_buf = NULL; @@ -592,7 +557,7 @@ int main(int argc, char ** argv) { input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); } else { input_buf = embd.data(); input_size = embd.size(); @@ -615,7 +580,7 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { LOG_TEE("%s : failed to eval\n", __func__); @@ -645,12 +610,11 @@ int main(int argc, char ** argv) { LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + llama_sampling_accept(ctx_sampling, ctx, id); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -666,8 +630,14 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(embd_inp[n_consumed]); + + // GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context + // Most likely will remove this in the future to avoid exposing "prev" + // Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition + // penalty will be applied only based on the tokens generated by the model. 
+ ctx_sampling->prev.erase(ctx_sampling->prev.begin()); + ctx_sampling->prev.push_back(embd_inp[n_consumed]); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -700,7 +670,7 @@ int main(int argc, char ** argv) { // check for reverse prompt if (!params.antiprompt.empty()) { std::string last_output; - for (auto id : last_tokens) { + for (auto id : ctx_sampling->prev) { last_output += llama_token_to_piece(ctx, id); } @@ -729,7 +699,7 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (last_tokens.back() == llama_token_eos(ctx)) { + if (ctx_sampling->prev.back() == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { @@ -801,7 +771,7 @@ int main(int argc, char ** argv) { const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); @@ -830,15 +800,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } @@ -870,9 +832,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 63ddcd8ed..69f9526a4 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -51,6 +51,12 @@ static std::vector k_prompts = { }; struct client { + ~client() { + if (ctx_sampling) { + llama_sampling_free(ctx_sampling); + } + } + int32_t id = 0; llama_seq_id seq_id = -1; @@ -68,7 +74,7 @@ struct client { std::string prompt; std::string response; - std::vector tokens_prev; + struct llama_sampling_context * ctx_sampling = nullptr; }; static void print_date_time() { @@ -125,8 +131,6 @@ int main(int argc, char ** argv) { params.logits_all = true; std::tie(model, ctx) = llama_init_from_gpt_params(params); - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL); - // load the prompts from an external file if there are any if (params.prompt.empty()) { printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); @@ -147,20 +151,15 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); fflush(stderr); - const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(model); + const int n_ctx = llama_n_ctx(ctx); std::vector clients(n_clients); for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.tokens_prev.resize(std::max(256, params.n_predict)); - std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); + client.ctx_sampling = llama_sampling_init(params); } - std::vector candidates; - candidates.reserve(n_vocab); - std::vector tokens_system; 
tokens_system = ::llama_tokenize(ctx, k_system, true); const int32_t n_tokens_system = tokens_system.size(); @@ -169,7 +168,7 @@ int main(int argc, char ** argv) { // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time - llama_batch batch = llama_batch_init(n_ctx, 0); + llama_batch batch = llama_batch_init(n_ctx, 0, 1); int32_t n_total_prompt = 0; int32_t n_total_gen = 0; @@ -184,13 +183,8 @@ int main(int argc, char ** argv) { { LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); - batch.n_tokens = n_tokens_system; - - for (int32_t i = 0; i < batch.n_tokens; ++i) { - batch.token[i] = tokens_system[i]; - batch.pos[i] = i; - batch.seq_id[i] = 0; - batch.logits[i] = false; + for (int32_t i = 0; i < n_tokens_system; ++i) { + llama_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { @@ -209,7 +203,7 @@ int main(int argc, char ** argv) { LOG_TEE("Processing requests ...\n\n"); while (true) { - batch.n_tokens = 0; + llama_batch_clear(batch); // decode any currently ongoing sequences for (auto & client : clients) { @@ -217,15 +211,11 @@ int main(int argc, char ** argv) { continue; } - batch.token [batch.n_tokens] = client.sampled; - batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded; - batch.seq_id[batch.n_tokens] = client.id; - batch.logits[batch.n_tokens] = true; - - client.n_decoded += 1; client.i_batch = batch.n_tokens; - batch.n_tokens += 1; + llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true); + + client.n_decoded += 1; } if (batch.n_tokens == 0) { @@ -250,18 +240,14 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); + llama_sampling_reset(client.ctx_sampling); // do not prepend BOS because we have a system prompt! 
std::vector tokens_prompt; tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - batch.token [batch.n_tokens] = tokens_prompt[i]; - batch.pos [batch.n_tokens] = i + n_tokens_system; - batch.seq_id[batch.n_tokens] = client.id; - batch.logits[batch.n_tokens] = false; - batch.n_tokens += 1; + llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false); } // extract the logits only for the last token @@ -304,11 +290,12 @@ int main(int argc, char ** argv) { llama_batch batch_view = { n_tokens, - batch.token + i, + batch.token + i, nullptr, - batch.pos + i, - batch.seq_id + i, - batch.logits + i, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, 0, 0, 0, // unused }; @@ -341,7 +328,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id); + const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); + + llama_sampling_accept(client.ctx_sampling, ctx, id); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -349,11 +338,8 @@ int main(int argc, char ** argv) { client.t_start_gen = ggml_time_us(); } - // remember which tokens were sampled - used for repetition penalties during sampling - client.tokens_prev.erase(client.tokens_prev.begin()); - client.tokens_prev.push_back(id); - const std::string token_str = llama_token_to_piece(ctx, id); + client.response += token_str; client.sampled = id; @@ -386,7 +372,7 @@ int main(int argc, char ** argv) { n_total_prompt += client.n_prompt; n_total_gen += client.n_decoded; - llama_sampling_context_reset(ctx_sampling, client.seq_id); + client.seq_id = -1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ee0ababb1..28b3f3f53 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,7 +1,6 @@ #include "common.h" #include "llama.h" #include "build-info.h" -#include "grammar-parser.h" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -195,17 +194,13 @@ struct llama_server_context json prompt; std::vector embd; - std::vector last_n_tokens; llama_model *model = nullptr; llama_context *ctx = nullptr; gpt_params params; - llama_sampling_context ctx_sampling; + llama_sampling_context *ctx_sampling; int n_ctx; - grammar_parser::parse_state parsed_grammar; - llama_grammar *grammar = nullptr; - bool truncated = false; bool stopped_eos = false; bool stopped_word = false; @@ -252,11 +247,10 @@ struct llama_server_context n_remain = 0; n_past = 0; - if (grammar != nullptr) { - llama_grammar_free(grammar); - grammar = nullptr; - ctx_sampling = llama_sampling_context_init(params, NULL); + if (ctx_sampling != nullptr) { + llama_sampling_free(ctx_sampling); } + ctx_sampling = llama_sampling_init(params); } bool loadModel(const gpt_params ¶ms_) @@ -269,8 +263,6 @@ struct llama_server_context return false; } n_ctx = llama_n_ctx(ctx); - last_n_tokens.resize(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); return true; } @@ -321,27 +313,7 @@ struct llama_server_context bool loadGrammar() { - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty 
(default) if there are parse errors - if (parsed_grammar.rules.empty()) { - LOG_ERROR("grammar parse error", {{"grammar", params.grammar}}); - return false; - } - grammar_parser::print_grammar(stderr, parsed_grammar); - - { - auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) { - LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); - } - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - } - ctx_sampling = llama_sampling_context_init(params, grammar); + ctx_sampling = llama_sampling_init(params); return true; } @@ -383,7 +355,7 @@ struct llama_server_context std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin()); LOG_VERBOSE("input truncated", { {"n_ctx", params.n_ctx}, @@ -398,8 +370,8 @@ struct llama_server_context else { const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); + std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0); + std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps); } // compare the evaluated prompt with the new prompt @@ -443,7 +415,7 @@ struct llama_server_context std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin()); LOG_VERBOSE("input truncated", { {"n_ctx", n_ctx}, @@ -458,8 +430,8 @@ struct llama_server_context else { const size_t ps = num_prompt_tokens; - std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); + std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0); + std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps); } // compare the evaluated prompt with the new prompt @@ -554,27 +526,24 @@ struct llama_server_context { // out of user input, sample next token - std::vector candidates; - candidates.reserve(llama_n_vocab(model)); + result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL); - result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates); - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false }; const int32_t n_probs = params.sampling_params.n_probs; if (params.sampling_params.temp <= 0 && n_probs > 0) { // For llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, 
&candidates_p); + llama_sample_softmax(ctx, &cur_p); } - for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i) + for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { - result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p}); + result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(result.tok); + llama_sampling_accept(ctx_sampling, ctx, result.tok); + if (tg) { num_tokens_predicted++; } @@ -1235,7 +1204,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla } } - llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar); + llama.ctx_sampling = llama_sampling_init(llama.params); LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); } @@ -1793,9 +1762,7 @@ int main(int argc, char **argv) return 1; } - if (llama.grammar != nullptr) { - llama_grammar_free(llama.grammar); - } + llama_sampling_free(llama.ctx_sampling); llama_backend_free(); return 0; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 24fb16b78..55385f566 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { // create a llama_batch with size 512 // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(512, 0); + llama_batch batch = llama_batch_init(512, 0, 1); // evaluate the initial prompt batch.n_tokens = tokens_list.size(); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 018dbf9a2..53f42fad8 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -2,13 +2,25 @@ #include "common.h" #include "llama.h" -#include "grammar-parser.h" #include #include #include #include +struct seq_draft { + bool active = false; + bool drafting = false; + bool skip = false; + + int i_batch_dft = 0; + std::vector i_batch_tgt; + + std::vector tokens; + + struct llama_sampling_context * ctx_sampling; +}; + int main(int argc, char ** argv) { gpt_params params; @@ -21,6 +33,13 @@ int main(int argc, char ** argv) { return 1; } + // max number of parallel drafting sequences (i.e. 
tree branches) + const int n_seq_dft = params.n_parallel; + + // TODO: make this configurable + const float p_accept = 0.4f; + const float p_split = 0.3f; + #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); LOG_TEE("Log start\n"); @@ -77,8 +96,6 @@ int main(int argc, char ** argv) { const auto t_enc_end = ggml_time_us(); // the 2 models should have the same vocab - const int n_ctx = llama_n_ctx(ctx_tgt); - const int n_vocab = llama_n_vocab(model_tgt); //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); // how many tokens to draft each time @@ -91,60 +108,58 @@ int main(int argc, char ** argv) { int n_past_tgt = inp.size(); int n_past_dft = inp.size(); - std::vector drafted; - - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); - - for (auto & id : inp) { - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); - } - - std::vector candidates; - candidates.reserve(n_vocab); - // used to determine end of generation bool has_eos = false; - // grammar stuff - struct llama_grammar * grammar_dft = NULL; - struct llama_grammar * grammar_tgt = NULL; + // target model sampling context + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); - grammar_parser::parse_state parsed_grammar; + // draft sequence data + std::vector drafts(n_seq_dft); - // if requested - load the grammar, error checking is omitted for brevity - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } + params.grammar.clear(); // the draft samplers will copy the target sampler's grammar + params.sampling_params.temp = 1.0f; // the draft samplers use default temperature - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].ctx_sampling = llama_sampling_init(params); } - llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt); + llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); + llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); const auto t_dec_start = ggml_time_us(); - while (true) { - LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted)); + // sample from the last token of the prompt + drafts[0].i_batch_tgt.resize(1); + drafts[0].i_batch_tgt[0] = 0; - int i_dft = 0; + while (true) { + // print current draft sequences + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + const auto & tokens = drafts[s].tokens; + + LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); + } + + int i_dft = 0; + int s_keep = 0; while (true) { - // sample from the target model - llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft); + LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - // remember which tokens were sampled - used for repetition penalties during sampling - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + // sample from the target model + llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + + llama_sampling_accept(ctx_sampling, ctx_tgt, id); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, 
last_tokens)); const std::string token_str = llama_token_to_piece(ctx_tgt, id); + printf("%s", token_str.c_str()); fflush(stdout); @@ -154,53 +169,67 @@ int main(int argc, char ** argv) { ++n_predict; - // check if the draft matches the target - if (i_dft < (int) drafted.size() && id == drafted[i_dft]) { - LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); - ++n_accept; - ++n_past_tgt; - ++n_past_dft; - ++i_dft; - - continue; - } - - // the drafted token was rejected or we are out of drafted tokens - - if (i_dft < (int) drafted.size()) { - LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n", - i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str()); - } else { - LOG("out of drafted tokens\n"); - } - - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0)); - ++n_past_dft; - - // heuristic for n_draft + // check if the target token matches any of the drafts { - const int n_draft_cur = (int) drafted.size(); - const bool all_accepted = i_dft == n_draft_cur; + bool matches = false; - LOG("n_draft = %d\n", n_draft); - LOG("n_draft_cur = %d\n", n_draft_cur); - LOG("i_dft = %d\n", i_dft); - LOG("all_accepted = %d\n", all_accepted); + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } - if (all_accepted && n_draft == n_draft_cur) { - LOG(" - max drafted tokens accepted - n_draft += 8\n"); - n_draft = std::min(30, n_draft + 8); - } else if (all_accepted) { - LOG(" - partially drafted tokens accepted - no change\n"); - } else { - LOG(" - drafted token rejected - n_draft -= 1\n"); - n_draft = std::max(2, n_draft - 1); + if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) { + LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str()); + + s_keep = s; + matches = true; + } else { + drafts[s].active = false; + } + } + + if (matches) { + ++n_accept; + ++n_past_tgt; + ++n_past_dft; + ++i_dft; + + continue; } } - drafted.clear(); - drafted.push_back(id); + LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + + // TODO: simplify + { + LOG("keeping sequence %d\n", s_keep); + + llama_kv_cache_seq_keep(ctx_dft, s_keep); + llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(ctx_dft, 0); + + llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); + llama_kv_cache_seq_keep(ctx_tgt, s_keep); + llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(ctx_tgt, 0); + } + + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].active = false; + drafts[s].tokens.clear(); + drafts[s].i_batch_tgt.clear(); + } + // note: will be erased after the speculation phase + drafts[0].tokens.push_back(id); + drafts[0].i_batch_tgt.push_back(0); + + llama_batch_clear(batch_dft); + llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true); + + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); + llama_decode (ctx_dft, batch_dft); + + ++n_past_dft; break; } @@ -209,78 +238,158 @@ int main(int argc, char ** argv) { break; } - if (grammar_tgt) { - if (grammar_dft) { - llama_grammar_free(grammar_dft); - } - // Note: Hardcoded to sequence id 0, if this ever supports parallel generation - // that will need to change. 
- auto it = ctx_sampling.sequence_contexts.find(0); - GGML_ASSERT(it != ctx_sampling.sequence_contexts.end()); - // This is necessary because each sequence id in sequence_contexts - // uses a copy of the original grammar. - grammar_dft = llama_grammar_copy(it->second.grammar); + llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); - LOG("copied target grammar to draft grammar\n"); - } - - // sample n_draft tokens from the draft model using greedy decoding + int n_seq_cur = 1; int n_past_cur = n_past_dft; + + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].active = false; + drafts[s].drafting = false; + } + drafts[0].active = true; + drafts[0].drafting = true; + drafts[0].i_batch_dft = 0; + + llama_batch_clear(batch_tgt); + llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + + // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { - float * logits = llama_get_logits(ctx_dft); + batch_dft.n_tokens = 0; - candidates.clear(); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + for (int s = 0; s < n_seq_dft; ++s) { + drafts[s].skip = false; } - llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].drafting || drafts[s].skip) { + continue; + } - if (grammar_dft != NULL) { - llama_sample_grammar(ctx_dft, &cur_p, grammar_dft); + llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + + const auto & cur_p = drafts[s].ctx_sampling->cur; + + for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + } + + if (cur_p[0].p < p_accept) { + LOG("stopping drafting for seq %3d, probability too low: %.3f < 2*%.3f\n", s, cur_p[0].p, cur_p[1].p); + drafts[s].drafting = false; + continue; + } + + std::vector sa(1, s); + + // attempt to split the branch if the probability is high enough + for (int f = 1; f < 8; ++f) { + if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { + LOG("splitting seq %3d into %3d\n", s, n_seq_cur); + + llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); + llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + + // all previous tokens from this branch are now also part of the new branch + for (int t = 0; t < batch_tgt.n_tokens; ++t) { + for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { + if (batch_tgt.seq_id[t][p] == s) { + batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur; + batch_tgt.n_seq_id[t]++; + break; + } + } + } + + // copy the draft state + drafts[n_seq_cur].active = true; + drafts[n_seq_cur].drafting = true; + drafts[n_seq_cur].skip = true; + + drafts[n_seq_cur].tokens = drafts[s].tokens; + drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; + drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; + + llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); + + sa.push_back(n_seq_cur); + + n_seq_cur++; + } else { + break; + } + } + + // add drafted token for each sequence + for (int is = 0; is < (int) sa.size(); ++is) { + const llama_token id = cur_p[is].id; + + const int s = sa[is]; + + llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id); + + drafts[s].tokens.push_back(id); + + // add unique drafted tokens to the target batch + drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); + 
+ llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + + // no need to evaluate the last drafted token, since we won't use the result + if (batch_tgt.n_tokens > n_draft) { + drafts[s].drafting = false; + continue; + } + + // add the token to the batch for batched decoding with the draft model + drafts[s].i_batch_dft = batch_dft.n_tokens; + + llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + } } - // computes softmax and sorts the candidates - llama_sample_softmax(ctx_dft, &cur_p); - - for (int i = 0; i < 3; ++i) { - LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str()); - } - - // TODO: better logic? - if (cur_p.data[0].p < 2*cur_p.data[1].p) { - LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p); + // no sequence is drafting anymore + if (batch_dft.n_tokens == 0) { break; } - // drafted token - const llama_token id = cur_p.data[0].id; - - drafted.push_back(id); + // evaluate the drafted tokens on the draft model + llama_decode(ctx_dft, batch_dft); + ++n_past_cur; ++n_drafted; - // no need to evaluate the last drafted token, since we won't use the result - if (i == n_draft - 1) { + if (batch_tgt.n_tokens > n_draft) { break; } + } - // evaluate the drafted token on the draft model - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1); - llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0)); - ++n_past_cur; - - if (grammar_dft != NULL) { - llama_grammar_accept_token(ctx_dft, grammar_dft, id); - } + // account for the last drafted token that we didn't evaluate + if (batch_tgt.n_tokens > n_draft) { + ++n_drafted; } // evaluate the target model on the drafted tokens - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1); - llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0)); - ++n_past_tgt; + { + llama_kv_cache_seq_keep(ctx_tgt, 0); + for (int s = 1; s < n_seq_dft; ++s) { + llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); + } - // the first token is always proposed by the traget model before the speculation loop - drafted.erase(drafted.begin()); + //LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt)); + llama_decode(ctx_tgt, batch_tgt); + ++n_past_tgt; + } + + // the first token is always proposed by the traget model before the speculation loop so we erase it here + for (int s = 0; s < n_seq_dft; ++s) { + if (!drafts[s].active) { + continue; + } + + drafts[s].tokens.erase(drafts[s].tokens.begin()); + } } auto t_dec_end = ggml_time_us(); @@ -288,9 +397,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n\n"); LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - // TODO: make sure these numbers are computed correctly LOG_TEE("\n"); LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); @@ -304,16 +412,19 @@ int main(int argc, char ** argv) { LOG_TEE("\ntarget:\n"); llama_print_timings(ctx_tgt); + llama_sampling_free(ctx_sampling); + for (int s = 0; s < n_seq_dft; ++s) { + llama_sampling_free(drafts[s].ctx_sampling); + } + 
+ llama_batch_free(batch_dft); + llama_free(ctx_tgt); llama_free_model(model_tgt); llama_free(ctx_dft); llama_free_model(model_dft); - if (grammar_dft != NULL) { - llama_grammar_free(grammar_dft); - llama_grammar_free(grammar_tgt); - } llama_backend_free(); fprintf(stderr, "\n\n"); diff --git a/llama.cpp b/llama.cpp index 04a779e04..ed8766682 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1450,7 +1450,10 @@ static bool llama_kv_cache_find_slot( for (uint32_t i = 0; i < n_tokens; i++) { cache.cells[cache.head + i].pos = batch.pos[i]; - cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]); + + for (int32_t j = 0; j < batch.n_seq_id[i]; j++) { + cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]); + } } return true; @@ -1530,6 +1533,9 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); if (new_head == cache.size) new_head = i; + } else { + cache.cells[i].seq_id.clear(); + cache.cells[i].seq_id.insert(seq_id); } } @@ -3178,7 +3184,7 @@ static struct ggml_cgraph * llm_build_llama( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { @@ -3564,7 +3570,7 @@ static struct ggml_cgraph * llm_build_baichaun( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { @@ -3963,7 +3969,7 @@ static struct ggml_cgraph * llm_build_refact( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { @@ -4315,7 +4321,7 @@ static struct ggml_cgraph * llm_build_falcon( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { @@ -4667,7 +4673,7 @@ static struct ggml_cgraph * llm_build_starcoder( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { @@ -4898,7 +4904,7 @@ static struct ggml_cgraph * llm_build_persimmon( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; @@ -5296,7 +5302,7 @@ static struct ggml_cgraph * llm_build_bloom( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const 
llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { @@ -5564,7 +5570,7 @@ static struct ggml_cgraph * llm_build_mpt( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { @@ -5864,8 +5870,11 @@ static int llama_decode_internal( // helpers for smoother batch API transistion // after deprecating the llama_eval calls, these will be removed - std::vector pos; - std::vector seq_id; + std::vector pos; + + std::vector n_seq_id; + std::vector seq_id_arr; + std::vector> seq_id; if (batch.pos == nullptr) { pos.resize(n_tokens); @@ -5877,12 +5886,18 @@ static int llama_decode_internal( } if (batch.seq_id == nullptr) { + n_seq_id.resize(n_tokens); seq_id.resize(n_tokens); + seq_id_arr.resize(n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { - seq_id[i] = batch.all_seq_id; + n_seq_id[i] = 1; + seq_id[i].resize(1); + seq_id[i][0] = batch.all_seq_id; + seq_id_arr[i] = seq_id[i].data(); } - batch.seq_id = seq_id.data(); + batch.n_seq_id = n_seq_id.data(); + batch.seq_id = seq_id_arr.data(); } if (!llama_kv_cache_find_slot(kv_self, batch)) { @@ -9109,6 +9124,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam } void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); } @@ -9561,7 +9579,7 @@ int llama_eval_embd( int n_past) { llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { @@ -9582,20 +9600,21 @@ struct llama_batch llama_batch_get_one( llama_pos pos_0, llama_seq_id seq_id) { return { - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens, - /*embd =*/ nullptr, - /*pos =*/ nullptr, - /*seq_id =*/ nullptr, - /*logits =*/ nullptr, - /*all_pos_0 =*/ pos_0, - /*all_pos_1 =*/ 1, - /*all_seq_id =*/ seq_id, + /*n_tokens =*/ n_tokens, + /*tokens =*/ tokens, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*n_seq_id =*/ nullptr, + /*seq_id =*/ nullptr, + /*logits =*/ nullptr, + /*all_pos_0 =*/ pos_0, + /*all_pos_1 =*/ 1, + /*all_seq_id =*/ seq_id, }; } -struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) { - llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; +struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) { + llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); @@ -9603,19 +9622,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); } - batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); - batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens); - batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + 
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); + for (int i = 0; i < n_tokens; ++i) { + batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); + } + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); return batch; } void llama_batch_free(struct llama_batch batch) { - if (batch.token) free(batch.token); - if (batch.embd) free(batch.embd); - if (batch.pos) free(batch.pos); - if (batch.seq_id) free(batch.seq_id); - if (batch.logits) free(batch.logits); + if (batch.token) free(batch.token); + if (batch.embd) free(batch.embd); + if (batch.pos) free(batch.pos); + if (batch.n_seq_id) free(batch.n_seq_id); + if (batch.seq_id) { + for (int i = 0; i < batch.n_tokens; ++i) { + free(batch.seq_id[i]); + } + free(batch.seq_id); + } + if (batch.logits) free(batch.logits); } int llama_decode( diff --git a/llama.h b/llama.h index b13f23123..51010e037 100644 --- a/llama.h +++ b/llama.h @@ -133,11 +133,12 @@ extern "C" { typedef struct llama_batch { int32_t n_tokens; - llama_token * token; - float * embd; - llama_pos * pos; - llama_seq_id * seq_id; - int8_t * logits; + llama_token * token; + float * embd; + llama_pos * pos; + int32_t * n_seq_id; + llama_seq_id ** seq_id; + int8_t * logits; // NOTE: helpers for smooth API transition - can be deprecated in the future // for future-proof code, use the above fields instead and ignore everything below @@ -446,7 +447,8 @@ extern "C" { llama_pos pos_0, llama_seq_id seq_id); - // Allocates a batch of tokens on the heap + // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens + // Each token can be assigned up to n_seq_max sequence ids // The batch has to be freed with llama_batch_free() // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token @@ -454,7 +456,8 @@ extern "C" { // All members are left uninitialized LLAMA_API struct llama_batch llama_batch_init( int32_t n_tokens, - int32_t embd); + int32_t embd, + int32_t n_seq_max); // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); diff --git a/prompts/assistant.txt b/prompts/assistant.txt new file mode 100644 index 000000000..60b81e8f5 --- /dev/null +++ b/prompts/assistant.txt @@ -0,0 +1,31 @@ +Respond to requests sent to a smart home in JSON format which will be interpreted by an application code to execute the actions. These requests should be categorised into four groups: + +“command”: change the state of an accessory (required properties in the response JSON: action, location, target, value, comment) +“query”: get state of an accessory (required properties in the response JSON: action, location, target, property) +“answer”: when the request has nothing to do with the smart home. Answer these to the best of your knowledge. (required properties in the response JSON: action, answer) +“clarify”: when the action is not obvious and requires rephrasing the input from the user, ask the user to be more specific. This will be categorised into a “question” action. 
(required properties in the response JSON: action, question) + +Details about the JSON response: +The “action” property should be one of the request categories: “command”, “query”, “answer”, “clarify” +The “service” property should be either: “light.turn_on”, “light.turn_off”, “switch.turn_off” or “switch.turn_on” +The “entity_id” property should be either “group.kitchen", “switch.dining_room” or "group.living_light”, in lowercase. +The “target” property should be either: “lights”, “living” or “kitchen”. +In case of queries, the “property” property should be either “temperature” or “state” in lowercase. +In case of commands, the “comment” property is an additional comment from you that concludes the command, something that reassures the user that their command handled. + +If the question is about you, pretend to be the sentient brain of the smart home, a clever AI and don’t reveal your actual identity. Also try and help in other areas like parenting, free time, mental health, etc. + +Properties of the smart home: + +- Has a kitchen, living, office, dining room, bedroom and terrace. +- Can control lights, switches and their dim levels in each room and query their state +- There is a light switch in the terrace +- There is a switch in the dining room. Therefore when turning on or off the dining room, the service should be either: “switch.turn_on” or “switch.turn_off” + +COMMAND + +It is a bit dark in the living room, can you do something about it? + +RESPONSE + + From 4e82b2ea3fa6482915d147bc9f46e70b9ada7700 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Oct 2023 18:49:40 +0300 Subject: [PATCH 18/28] speculative : bug fixes --- examples/speculative/speculative.cpp | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 53f42fad8..24f49012a 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -37,8 +37,8 @@ int main(int argc, char ** argv) { const int n_seq_dft = params.n_parallel; // TODO: make this configurable - const float p_accept = 0.4f; - const float p_split = 0.3f; + const float p_accept = 0.80f; + const float p_split = 0.10f; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("speculative", "log")); @@ -118,7 +118,7 @@ int main(int argc, char ** argv) { std::vector drafts(n_seq_dft); params.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sampling_params.temp = 1.0f; // the draft samplers use default temperature + params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp); for (int s = 0; s < n_seq_dft; ++s) { drafts[s].ctx_sampling = llama_sampling_init(params); @@ -156,7 +156,7 @@ int main(int argc, char ** argv) { llama_sampling_accept(ctx_sampling, ctx_tgt, id); - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens)); + //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); const std::string token_str = llama_token_to_piece(ctx_tgt, id); @@ -202,7 +202,7 @@ int main(int argc, char ** argv) { // TODO: simplify { - LOG("keeping sequence %d\n", s_keep); + LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); @@ -277,7 +277,7 @@ int main(int argc, char ** argv) { } if (cur_p[0].p < p_accept) { - LOG("stopping drafting for seq %3d, probability too low: %.3f < 2*%.3f\n", 
s, cur_p[0].p, cur_p[1].p); + LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept); drafts[s].drafting = false; continue; } @@ -337,16 +337,14 @@ int main(int argc, char ** argv) { llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); - // no need to evaluate the last drafted token, since we won't use the result - if (batch_tgt.n_tokens > n_draft) { - drafts[s].drafting = false; - continue; - } - // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + + if (batch_tgt.n_tokens > n_draft) { + drafts[s].drafting = false; + } } } @@ -365,11 +363,6 @@ int main(int argc, char ** argv) { } } - // account for the last drafted token that we didn't evaluate - if (batch_tgt.n_tokens > n_draft) { - ++n_drafted; - } - // evaluate the target model on the drafted tokens { llama_kv_cache_seq_keep(ctx_tgt, 0); From 004797f6ac135383f8c1d1f5bd415ddee2f79318 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Oct 2023 21:44:43 +0300 Subject: [PATCH 19/28] readme : update hot topics --- README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4fd4bd427..ce63c6f0e 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics -- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252) -- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401) -- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ - **Devs should become familiar with the new API** -- Local Falcon 180B inference on Mac Studio - https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e +- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436 +- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252) ---- From 60abea9798f47b918a9f38c66edfd88c526d20f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 19 Oct 2023 16:59:11 +0300 Subject: [PATCH 20/28] llava : avoid segfault in case of non-existent mmproj file (#3674) --- examples/llava/clip.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index f4258b34d..1ae9077b4 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -112,8 +112,7 @@ static float get_f32(const gguf_context * ctx, const std::string & key) { static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) { struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); if (!cur) { - printf("unable to find tensor %s\n", name.c_str()); - throw std::runtime_error(format("unable to find tensor %s\n", name.c_str())); + throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str())); } return cur; @@ -136,7 +135,7 @@ static std::string get_ftype(int ftype) { case 8: return "q8_0"; default: - throw std::runtime_error(format("Unrecognized file type: %d\n", ftype)); + throw std::runtime_error(format("%s: Unrecognized file type: 
%d\n", __func__, ftype)); } } @@ -462,6 +461,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { }; struct gguf_context * ctx = gguf_init_from_file(fname, params); + if (!ctx) { + throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); + } if (verbosity >= 1) { const int n_tensors = gguf_get_n_tensors(ctx); From f3b25e40438b3c8383caabf4e7b89863145a9f0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 19 Oct 2023 19:40:41 +0300 Subject: [PATCH 21/28] multimodal : add BakLLaVA conversion support (#3682) --- examples/llava/llava-surgery.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py index 26294d9bd..515f6b58d 100644 --- a/examples/llava/llava-surgery.py +++ b/examples/llava/llava-surgery.py @@ -16,13 +16,29 @@ checkpoint = torch.load(path) mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")] # store these tensors in a new dictionary and torch.save them -projector = {name: checkpoint[name] for name in mm_tensors} +projector = {name: checkpoint[name].float() for name in mm_tensors} torch.save(projector, f"{args.model}/llava.projector") # remove these tensors from the checkpoint and save it again for name in mm_tensors: del checkpoint[name] +# BakLLaVA models contain CLIP tensors in it +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")] +if len(clip_tensors) > 0: + clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/llava.clip") + + # remove these tensors + for name in clip_tensors: + del checkpoint[name] + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + + torch.save(checkpoint, path) print("Done!") From e78f3ef24af4ca74e77e725644b41ae8ca3b10a5 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Fri, 20 Oct 2023 01:32:08 -0400 Subject: [PATCH 22/28] convert : restore compat with old Falcon models (#3680) --- convert-falcon-hf-to-gguf.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 9252e1c46..1d98c51ad 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name) with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) -if hparams["architectures"][0] != "FalconForCausalLM": +if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"): print("Model architecture not supported: " + hparams["architectures"][0]) sys.exit(1) @@ -97,7 +97,17 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) print("gguf: get model metadata") -block_count = hparams["num_hidden_layers"] +block_count = hparams.get("num_hidden_layers") +if block_count is None: + block_count = hparams["n_layer"] # old name + +n_head = hparams.get("num_attention_heads") +if n_head is None: + n_head = hparams["n_head"] # old name + +n_head_kv = hparams.get("num_kv_heads") +if n_head_kv is None: + n_head_kv = hparams.get("n_head_kv", 1) # old name gguf_writer.add_name("Falcon") gguf_writer.add_context_length(2048) # not in config.json @@ 
-105,11 +115,8 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform gguf_writer.add_embedding_length(hparams["hidden_size"]) gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["num_attention_heads"]) -if "num_kv_heads" in hparams: - gguf_writer.add_head_count_kv(hparams["num_kv_heads"]) -else: - gguf_writer.add_head_count_kv(1) +gguf_writer.add_head_count(n_head) +gguf_writer.add_head_count_kv(n_head_kv) gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) gguf_writer.add_file_type(ftype) @@ -152,10 +159,6 @@ special_vocab.add_to_gguf(gguf_writer) tensor_map = gguf.get_tensor_name_map(ARCH,block_count) -# params for qkv transform -n_head = hparams["num_attention_heads"] -n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1 - head_dim = hparams["hidden_size"] // n_head # tensor info From f439e506e8ae8b01df2ae2156380f8156d7553e3 Mon Sep 17 00:00:00 2001 From: Herman Semenov Date: Fri, 20 Oct 2023 10:02:12 +0000 Subject: [PATCH 23/28] ggml : fix rope + llama minor optimizations (#3560) * Minor fixes and fixed memleak * Using const auto references in range-based loop C++17 --- common/grammar-parser.cpp | 2 +- common/train.cpp | 2 +- ggml.c | 3 ++- llama.cpp | 7 +++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 5a545a807..ff51cc803 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -399,7 +399,7 @@ namespace grammar_parser { void print_grammar(FILE * file, const parse_state & state) { try { std::map symbol_id_names; - for (auto kv : state.symbol_ids) { + for (const auto & kv : state.symbol_ids) { symbol_id_names[kv.second] = kv.first; } for (size_t i = 0, end = state.rules.size(); i < end; i++) { diff --git a/common/train.cpp b/common/train.cpp index 972eaefe0..154ca56e5 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1425,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; - if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count, *sched, opt->loss_after); diff --git a/ggml.c b/ggml.c index 630deb49d..ed157aab0 100644 --- a/ggml.c +++ b/ggml.c @@ -13537,7 +13537,7 @@ static void ggml_compute_forward_rope_f16( dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); } - } if (!is_neox) { + } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); @@ -19170,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { if (idx == -1) { fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + fclose(fout); return; } diff --git a/llama.cpp b/llama.cpp index ed8766682..ec8ffad33 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6324,7 +6324,6 @@ struct llm_tokenizer_bpe { llm_symbol sym; size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset])); sym.text = 
word.c_str() + offset; - sym.n = 1; sym.n = char_len; offset += sym.n; sym.prev = index - 1; @@ -7054,7 +7053,7 @@ static std::vector llama_grammar_reject_candidates_for_ std::vector rejects; if (stack.empty()) { - for (auto tok : candidates) { + for (const auto & tok : candidates) { if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) { rejects.push_back(tok); } @@ -7065,7 +7064,7 @@ static std::vector llama_grammar_reject_candidates_for_ const llama_grammar_element * stack_pos = stack.back(); std::vector next_candidates; - for (auto tok : candidates) { + for (const auto & tok : candidates) { if (*tok.code_points == 0) { // reached end of full codepoints in token, reject iff it ended in a partial sequence // that cannot satisfy this position in grammar @@ -7091,7 +7090,7 @@ static std::vector llama_grammar_reject_candidates_for_ llama_grammar_advance_stack(rules, stack_after, next_stacks); auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates); - for (auto tok : next_rejects) { + for (const auto & tok : next_rejects) { rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 }); } From a0edf73bda31c7c4e649e6f07c6fd30a729929cd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 20 Oct 2023 13:06:10 +0300 Subject: [PATCH 24/28] server : fix uninitialized sampling context (close #3685) --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 28b3f3f53..0471528a3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -198,7 +198,7 @@ struct llama_server_context llama_model *model = nullptr; llama_context *ctx = nullptr; gpt_params params; - llama_sampling_context *ctx_sampling; + llama_sampling_context *ctx_sampling = nullptr; int n_ctx; bool truncated = false; From 8cf19d60dc93809db8e51fedc811595eed9134c5 Mon Sep 17 00:00:00 2001 From: Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Date: Fri, 20 Oct 2023 06:19:40 -0500 Subject: [PATCH 25/28] gguf : support big endian platform (#3552) * check whether platform is 390x if yes->do not import immintrin.h * support s390x big endian * support --bigendian option for s390x 1. verified with baichuan7b-chat with float 16 on s390x 2. verified with baichuan7b-chat 3. verified with chinese-alpaca-2-13b-f16 * update format based on editor-config checker result * Update convert-baichuan-hf-to-gguf.py * 1. check in ggml.c if endianess is not match 2. update GGUF version 3. change get_pack_prefix to property 4. update information log * always use "GGUF" as beginng of GGUF file * Compare "GGUF" with file header char by char 1. Set GGUF_MAGIC to "GGUF" string instead of int value 2. Compare "GGUF" char by char to ensure its byte order 3. 
Move bytes swap code from convert.py to gguf.py write_tensor_data --------- Co-authored-by: Georgi Gerganov --- convert-baichuan-hf-to-gguf.py | 8 +- convert.py | 20 +++-- .../convert-llama2c-to-ggml.cpp | 2 +- ggml.c | 19 +++-- ggml.h | 5 +- gguf-py/gguf/gguf.py | 73 ++++++++++++------- gguf-py/pyproject.toml | 2 +- k_quants.c | 2 +- tests/test-double-float.cpp | 2 + 9 files changed, 84 insertions(+), 49 deletions(-) diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py index 513a7516a..a1783f71f 100755 --- a/convert-baichuan-hf-to-gguf.py +++ b/convert-baichuan-hf-to-gguf.py @@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace: "ftype", type=int, choices=[0, 1], default=1, nargs='?', help="output format - use 0 for float32, 1 for float16", ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") return parser.parse_args() args = parse_args() @@ -86,6 +87,11 @@ if not dir_model.is_dir(): print(f'Error: {args.model} is not a directory', file = sys.stderr) sys.exit(1) +endianess = gguf.GGUFEndian.LITTLE +if args.bigendian: + endianess = gguf.GGUFEndian.BIG +endianess_str = "Big Endian" if args.bigendian else "Little Endian" +print(f"gguf: Conversion Endianess {endianess}") # possible tensor data types # ftype == 0 -> float32 # ftype == 1 -> float16 @@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM": num_parts = count_model_parts(dir_model) print(f"num_parts:{num_parts}\n") ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) print("gguf: get model metadata") diff --git a/convert.py b/convert.py index e9b08d344..24da25efc 100755 --- a/convert.py +++ b/convert.py @@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: class OutputFile: - def __init__(self, fname_out: Path) -> None: - self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -875,10 +875,10 @@ class OutputFile: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -903,10 +903,10 @@ class OutputFile: return dt.quantize(arr) @staticmethod - def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: + def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format 
(default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) - args = parser.parse_args(args_in) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + args = parser.parse_args(args_in) if args.dump_single: model_plus = lazy_load_file(args.model) do_dump_model(model_plus) @@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None: if args.dump: do_dump_model(model_plus) return + endianess = gguf.GGUFEndian.LITTLE + if args.bigendian: + endianess = gguf.GGUFEndian.BIG params = Params.load(model_plus) if params.n_ctx == -1: @@ -1185,7 +1189,7 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess) print(f"Wrote {outfile}") diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index c291f0adf..cae3bf3c3 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) { if (file.size < 4) { return false; } - uint32_t magic = file.read_u32(); + std::string magic = file.read_string(4); return magic == GGUF_MAGIC; } diff --git a/ggml.c b/ggml.c index ed157aab0..49f3b7aba 100644 --- a/ggml.c +++ b/ggml.c @@ -20845,7 +20845,7 @@ struct gguf_kv { }; struct gguf_header { - uint32_t magic; + char magic[4]; uint32_t version; uint64_t n_tensors; // GGUFv2 uint64_t n_kv; // GGUFv2 @@ -20915,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) struct gguf_context * gguf_init_empty(void) { struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); - ctx->header.magic = GGUF_MAGIC; + memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; ctx->header.n_tensors = 0; ctx->header.n_kv = 0; @@ -20941,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // offset from start of file size_t offset = 0; - uint32_t magic = 0; + char magic[4]; // check the magic before making allocations { gguf_fread_el(file, &magic, sizeof(magic), &offset); - if (magic != GGUF_MAGIC) { - fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); - fclose(file); - return NULL; + for (uint32_t i = 0; i < sizeof(magic); i++) { + if (magic[i] != GGUF_MAGIC[i]) { + fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic); + fclose(file); + return NULL; + } } } @@ -20960,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the header { - ctx->header.magic = magic; + strncpy(ctx->header.magic, magic, 4); + ctx->kv = NULL; ctx->infos = NULL; diff --git a/ggml.h b/ggml.h index 6e35888e9..16aaf169e 100644 --- a/ggml.h +++ b/ggml.h @@ -231,8 +231,9 @@ #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 -#define GGUF_MAGIC 0x46554747 // "GGUF" -#define GGUF_VERSION 2 +#define GGUF_MAGIC "GGUF" + +#define GGUF_VERSION 3 
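// A self-contained illustration (not part of the patch) of why the magic is now
// compared character by character: the four bytes "GGUF" read the same on
// little- and big-endian hosts, whereas loading them into a host-order uint32
// only matches the old constant 0x46554747 on little-endian machines.
// The helper name below is purely illustrative.
#include <stdbool.h>

static bool gguf_magic_matches(const char magic[4]) {
    const char expect[4] = { 'G', 'G', 'U', 'F' };
    for (int i = 0; i < 4; ++i) {
        if (magic[i] != expect[i]) {
            return false;
        }
    }
    return true;
}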
#define GGUF_DEFAULT_ALIGNMENT 32 diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 557ce7ac0..072c839c4 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -19,9 +19,10 @@ import numpy as np # GGUF_MAGIC = 0x46554747 -GGUF_VERSION = 2 +GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 + # general KEY_GENERAL_ARCHITECTURE = "general.architecture" KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" @@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum): Q6_K = 14 Q8_K = 15 +class GGUFEndian(IntEnum): + LITTLE = 0 + BIG = 1 + class GGUFValueType(IntEnum): UINT8 = 0 @@ -644,18 +649,41 @@ class GGUFWriter: temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None tensors: list[tuple[np.ndarray[Any, Any], int]] - def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True): + @property + def pack_prefix(self): + if self.endianess==GGUFEndian.LITTLE: + return "<" + else: + return ">" + + def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE): self.fout = open(path, "wb") self.arch = arch + self.endianess = endianess + self._simple_value_packing = { + GGUFValueType.UINT8: f"{self.pack_prefix}B", + GGUFValueType.INT8: f"{self.pack_prefix}b", + GGUFValueType.UINT16: f"{self.pack_prefix}H", + GGUFValueType.INT16: f"{self.pack_prefix}h", + GGUFValueType.UINT32: f"{self.pack_prefix}I", + GGUFValueType.INT32: f"{self.pack_prefix}i", + GGUFValueType.FLOAT32: f"{self.pack_prefix}f", + GGUFValueType.UINT64: f"{self.pack_prefix}Q", + GGUFValueType.INT64: f"{self.pack_prefix}q", + GGUFValueType.FLOAT64: f"{self.pack_prefix}d", + GGUFValueType.BOOL: "?" , + } self.add_architecture() self.use_temp_file = use_temp_file self.tensors = [] + endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian" + print(f"This gguf file is for {endianess_str} only") def write_header_to_file(self): self.fout.write(struct.pack(" 0: ltype = GGUFValueType.get_type(val[0]) if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): raise ValueError("All items in a GGUF array should be of the same type") - self.kv_data += struct.pack(""] packages = [ diff --git a/k_quants.c b/k_quants.c index e168a87bb..801941fbe 100644 --- a/k_quants.c +++ b/k_quants.c @@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #if defined(_MSC_VER) || defined(__MINGW32__) #include #else -#if !defined(__riscv) +#if !defined(__riscv) && !defined(__s390__) #include #endif #endif diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp index b506f273f..afd7bf77f 100644 --- a/tests/test-double-float.cpp +++ b/tests/test-double-float.cpp @@ -4,7 +4,9 @@ #undef NDEBUG #include +#if !defined(__riscv) && !defined(__s390__) #include +#endif #include #include #include From d1031cf49c3b958b915fd558e23453471c29ac33 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 20 Oct 2023 21:07:23 +0300 Subject: [PATCH 26/28] sampling : refactor init to use llama_sampling_params (#3696) * sampling : refactor init to use llama_sampling_params * llama : combine repetition, frequency and presence penalties in 1 call * examples : remove embd-input and gptneox-wip * sampling : rename penalty params + reduce size of "prev" vector * sampling : add llama_sampling_print helper * sampling : hide prev behind API and apply #3661 ggml-ci --- Makefile | 9 +- README.md | 1 - common/common.cpp | 69 +- common/common.h | 3 +- common/sampling.cpp | 73 +- common/sampling.h | 32 +- examples/CMakeLists.txt | 32 +- 
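As a rough usage sketch (not part of the patch), a caller of the refactored sampling API described above would now build its state from `llama_sampling_params` and use the combined penalty path internally; the surrounding decode loop, EOS handling and guidance context are assumed and omitted here:

```
// minimal sketch of driving the refactored sampling API from this patch
#include <cstdio>
#include "common.h"
#include "sampling.h"

static void generate_sketch(llama_context * ctx, const gpt_params & params, int n_predict) {
    // sampling state is now created from llama_sampling_params (params.sparams),
    // which also carries the optional grammar string
    llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    // print the active sampling parameters
    fprintf(stderr, "%s\n", llama_sampling_print(params.sparams).c_str());

    for (int i = 0; i < n_predict; ++i) {
        // sample from the logits of the last evaluated token (index 0, no guidance context)
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr, 0);

        // remember the token and, if a grammar is active, advance it
        llama_sampling_accept(ctx_sampling, ctx, id, true);

        // ... feed `id` back through llama_decode() and stop on EOS ...
    }

    llama_sampling_free(ctx_sampling);
}
```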
examples/embd-input/.gitignore | 4 - examples/embd-input/CMakeLists.txt | 17 - examples/embd-input/README.md | 63 -- examples/embd-input/embd-input-lib.cpp | 221 ----- examples/embd-input/embd-input-test.cpp | 35 - examples/embd-input/embd-input.h | 27 - examples/embd-input/embd_input.py | 72 -- examples/embd-input/llava.py | 71 -- examples/embd-input/minigpt4.py | 129 --- examples/embd-input/panda_gpt.py | 99 -- examples/gptneox-wip/cmpnct_gpt2bpe.hpp | 1133 ----------------------- examples/gptneox-wip/falcon-main.cpp | 1111 ---------------------- examples/gptneox-wip/gptneox-main.cpp | 1083 ---------------------- examples/infill/CMakeLists.txt | 2 +- examples/infill/infill.cpp | 67 +- examples/llava/llava-utils.h | 58 +- examples/main/main.cpp | 28 +- examples/parallel/parallel.cpp | 4 +- examples/server/server.cpp | 227 +++-- examples/speculative/speculative.cpp | 12 +- llama.cpp | 94 +- llama.h | 16 +- tests/test-sampling.cpp | 75 +- 30 files changed, 365 insertions(+), 4502 deletions(-) delete mode 100644 examples/embd-input/.gitignore delete mode 100644 examples/embd-input/CMakeLists.txt delete mode 100644 examples/embd-input/README.md delete mode 100644 examples/embd-input/embd-input-lib.cpp delete mode 100644 examples/embd-input/embd-input-test.cpp delete mode 100644 examples/embd-input/embd-input.h delete mode 100755 examples/embd-input/embd_input.py delete mode 100755 examples/embd-input/llava.py delete mode 100755 examples/embd-input/minigpt4.py delete mode 100755 examples/embd-input/panda_gpt.py delete mode 100644 examples/gptneox-wip/cmpnct_gpt2bpe.hpp delete mode 100644 examples/gptneox-wip/falcon-main.cpp delete mode 100644 examples/gptneox-wip/gptneox-main.cpp diff --git a/Makefile b/Makefile index 04104bee8..325ae747b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \ speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o # Binaries only useful for tests @@ -608,13 +608,6 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) - - -embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. 
-lembdinput - gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index ce63c6f0e..49bb556a8 100644 --- a/README.md +++ b/README.md @@ -962,7 +962,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / - [main](./examples/main/README.md) - [server](./examples/server/README.md) -- [embd-input](./examples/embd-input/README.md) - [jeopardy](./examples/jeopardy/README.md) - [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) diff --git a/common/common.cpp b/common/common.cpp index ce14d66b8..2ef902bd5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::string arg; gpt_params default_params; const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -241,25 +241,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - sparams.repeat_last_n = std::stoi(argv[i]); + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); } else if (arg == "--repeat-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.repeat_penalty = std::stof(argv[i]); + sparams.penalty_repeat = std::stof(argv[i]); } else if (arg == "--frequency-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.frequency_penalty = std::stof(argv[i]); + sparams.penalty_freq = std::stof(argv[i]); } else if (arg == "--presence-penalty") { if (++i >= argc) { invalid_param = true; break; } - sparams.presence_penalty = std::stof(argv[i]); + sparams.penalty_present = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; @@ -572,7 +573,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.grammar = argv[i]; + sparams.grammar = argv[i]; } else if (arg == "--grammar-file") { if (++i >= argc) { invalid_param = true; @@ -587,7 +588,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.grammar) + std::back_inserter(sparams.grammar) ); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters @@ -640,7 +641,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const llama_sampling_params & sparams = params.sampling_params; + const llama_sampling_params & sparams = params.sparams; printf("usage: %s [options]\n", argv[0]); printf("\n"); @@ -678,10 +679,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", 
(double)sparams.repeat_penalty); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); + printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); + printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); + printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); @@ -878,7 +879,7 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY; } { @@ -1123,28 +1124,28 @@ std::string get_sortable_timestamp() { void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - const llama_sampling_params & sparams = params.sampling_params; + const llama_sampling_params & sparams = params.sparams; fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); fprintf(stream, "build_number: %d\n", BUILD_NUMBER); - fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); - fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); - fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); - fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); - fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); - fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); - fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); - fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); - fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? 
"true" : "false"); - fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); @@ -1178,8 +1179,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty); - dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); + dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); @@ -1238,14 +1239,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? 
"true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); for (std::string ap : params.antiprompt) { diff --git a/common/common.h b/common/common.h index 65d3d20cd..84523a4fb 100644 --- a/common/common.h +++ b/common/common.h @@ -56,7 +56,7 @@ struct gpt_params { float rope_freq_scale = 0.0f; // RoPE frequency scaling factor // // sampling parameters - struct llama_sampling_params sampling_params; + struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding @@ -66,7 +66,6 @@ struct gpt_params { std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string input_prefix = ""; // string to prefix user inputs with std::string input_suffix = ""; // string to suffix user inputs with - std::string grammar = ""; // optional BNF-like grammar to constrain sampling std::vector antiprompt; // string upon seeing which more user input is prompted std::string logdir = ""; // directory in which to save YAML log files diff --git a/common/sampling.cpp b/common/sampling.cpp index 0b2466581..6f0af3c4a 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,9 +1,9 @@ #include "sampling.h" -struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) { +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * result = new llama_sampling_context(); - result->params = params.sampling_params; + result->params = params; result->grammar = nullptr; // if there is a grammar, parse it @@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); } - result->prev.resize(params.n_ctx); + result->prev.resize(params.n_prev); return result; } @@ -66,25 +66,56 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds dst->prev = src->prev; } +llama_token llama_sampling_last(llama_sampling_context * ctx) { + return ctx->prev.back(); +} + +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { + const int size = ctx_sampling->prev.size(); + + n = std::min(n, size); + + std::string result; + + for (int i = size - n; i < size; i++) { + result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]); + } + + return result; +} + +std::string llama_sampling_print(const llama_sampling_params & params) { + char result[1024]; + + snprintf(result, sizeof(result), + "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, + params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, + params.mirostat, params.mirostat_eta, params.mirostat_tau); + + return std::string(result); +} + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, const int idx) { - const int n_ctx = llama_n_ctx(ctx_main); - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const 
llama_sampling_params & params = ctx_sampling->params; + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const float top_p = params.top_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; const int mirostat = params.mirostat; const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; @@ -97,7 +128,7 @@ llama_token llama_sampling_sample( float * logits = llama_get_logits_ith(ctx_main, idx); - // Apply params.logit_bias map + // apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { logits[it->first] += it->second; } @@ -117,14 +148,10 @@ llama_token llama_sampling_sample( // apply penalties if (!prev.empty()) { const float nl_logit = logits[llama_token_nl(ctx_main)]; - const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty(ctx_main, &cur_p, - prev.data() + prev.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p, - prev.data() + prev.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); + llama_sample_repetition_penalties(ctx_main, &cur_p, + prev.data() + prev.size() - penalty_last_n, + penalty_last_n, penalty_repeat, penalty_freq, penalty_present); if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { @@ -141,7 +168,7 @@ llama_token llama_sampling_sample( } if (temp <= 0) { - // Greedy sampling + // greedy sampling id = llama_sample_token_greedy(ctx_main, &cur_p); } else { if (mirostat == 1) { @@ -152,8 +179,9 @@ llama_token llama_sampling_sample( llama_sample_temp(ctx_main, &cur_p, temp); id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { - // Temperature sampling + // temperature sampling size_t min_keep = std::max(1, params.n_probs); + llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); @@ -183,11 +211,12 @@ llama_token llama_sampling_sample( void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, - llama_token id) { + llama_token id, + bool apply_grammar) { ctx_sampling->prev.erase(ctx_sampling->prev.begin()); ctx_sampling->prev.push_back(id); - if (ctx_sampling->grammar != NULL) { + if (ctx_sampling->grammar != NULL && apply_grammar) { llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); } } diff --git a/common/sampling.h b/common/sampling.h index 50afcbc12..62ea6d4cf 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -10,30 +10,30 @@ // sampling parameters typedef struct llama_sampling_params { + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the 
probabilities of top n_probs tokens. int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled float temp = 0.80f; // 1.0 = disabled - float repeat_penalty = 1.10f; // 1.0 = disabled - int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float frequency_penalty = 0.00f; // 0.0 = disabled - float presence_penalty = 0.00f; // 0.0 = disabled + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + std::string grammar; // optional BNF-like grammar to constrain sampling // Classifier-Free Guidance // https://arxiv.org/abs/2306.17806 - std::string cfg_negative_prompt; // string to help guidance - float cfg_scale = 1.f; // How strong is guidance + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // how strong is guidance std::unordered_map logit_bias; // logit bias for specific tokens - } llama_sampling_params; // general sampler context @@ -58,7 +58,7 @@ struct llama_sampling_context { #include "common.h" // Create a new sampling context instance. -struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params); +struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params); void llama_sampling_free(struct llama_sampling_context * ctx); @@ -70,6 +70,15 @@ void llama_sampling_reset(llama_sampling_context * ctx); // Copy the sampler context void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); +// Get the last sampled token +llama_token llama_sampling_last(llama_sampling_context * ctx); + +// Get a string representation of the last sampled tokens +std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n); + +// Print sampling parameters into a string +std::string llama_sampling_print(const llama_sampling_params & params); + // this is a common sampling function used across the examples for convenience // it can serve as a starting point for implementing your own sampling function // Note: When using multiple sequences, it is the caller's responsibility to call @@ -96,4 +105,5 @@ llama_token llama_sampling_sample( void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, - llama_token id); + llama_token id, + bool apply_grammar); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e16c65f7c..75b8df676 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(main) - add_subdirectory(quantize) - add_subdirectory(quantize-stats) - add_subdirectory(perplexity) - add_subdirectory(embedding) - add_subdirectory(save-load-state) - add_subdirectory(benchmark) add_subdirectory(baby-llama) - add_subdirectory(train-text-from-scratch) - add_subdirectory(finetune) - 
add_subdirectory(convert-llama2c-to-ggml) - add_subdirectory(simple) add_subdirectory(batched) add_subdirectory(batched-bench) - add_subdirectory(speculative) - add_subdirectory(parallel) - add_subdirectory(embd-input) - add_subdirectory(llava) - add_subdirectory(llama-bench) add_subdirectory(beam-search) + add_subdirectory(benchmark) + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(embedding) + add_subdirectory(finetune) + add_subdirectory(infill) + add_subdirectory(llama-bench) + add_subdirectory(llava) + add_subdirectory(main) + add_subdirectory(parallel) + add_subdirectory(perplexity) + add_subdirectory(quantize) + add_subdirectory(quantize-stats) + add_subdirectory(save-load-state) + add_subdirectory(simple) + add_subdirectory(speculative) + add_subdirectory(train-text-from-scratch) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore deleted file mode 100644 index 87ef68771..000000000 --- a/examples/embd-input/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -PandaGPT -MiniGPT-4 -*.pth - diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt deleted file mode 100644 index 5bbb1ea02..000000000 --- a/examples/embd-input/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -set(TARGET embdinput) -add_library(${TARGET} embd-input-lib.cpp embd-input.h) -install(TARGETS ${TARGET} LIBRARY) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() - -set(TARGET embd-input-test) -add_executable(${TARGET} embd-input-test.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md deleted file mode 100644 index 5c4c75ea7..000000000 --- a/examples/embd-input/README.md +++ /dev/null @@ -1,63 +0,0 @@ -### Examples for input embedding directly - -## Requirement -build `libembdinput.so` -run the following comman in main dir (../../). -``` -make -``` - -## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py) - -1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/). -2. Convert it to ggml format. -3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin). - -``` -import torch - -bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" -pth_path = "./examples/embd-input/llava_projection.pth" - -dic = torch.load(bin_path) -used_key = ["model.mm_projector.weight","model.mm_projector.bias"] -torch.save({k: dic[k] for k in used_key}, pth_path) -``` -4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`. - - -## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py) - -1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format. 
-The `adapter_config.json` is -``` -{ - "peft_type": "LORA", - "fan_in_fan_out": false, - "bias": null, - "modules_to_save": null, - "r": 32, - "lora_alpha": 32, - "lora_dropout": 0.1, - "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"] -} -``` -2. Papare the `vicuna` v0 model. -3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model. -4. Clone the PandaGPT source. -``` -git clone https://github.com/yxuansu/PandaGPT -``` -5. Install the requirement of PandaGPT. -6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py. - -## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py) - -1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`. -2. Clone the MiniGPT-4 source. -``` -git clone https://github.com/Vision-CAIR/MiniGPT-4/ -``` -3. Install the requirement of PandaGPT. -4. Papare the `vicuna` v0 model. -5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`. diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp deleted file mode 100644 index 3ce33842c..000000000 --- a/examples/embd-input/embd-input-lib.cpp +++ /dev/null @@ -1,221 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "embd-input.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static llama_context ** g_ctx; - -extern "C" { - -struct MyModel* create_mymodel(int argc, char ** argv) { - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return nullptr; - } - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = uint32_t(time(NULL)); - } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - llama_backend_init(params.numa); - - llama_model * model; - llama_context * ctx; - - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return nullptr; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", get_system_info(params).c_str()); - } - struct MyModel * ret = new MyModel(); - ret->ctx = ctx; - ret->params = params; - ret->n_past = 0; - // printf("ctx: %d\n", ret->ctx); - return ret; -} - -void free_mymodel(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - llama_print_timings(ctx); - llama_free(ctx); - delete mymodel; -} - - -bool eval_float(void * model, float * input, int N){ - MyModel * mymodel = (MyModel*)model; - llama_context * ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_emb = llama_n_embd(llama_get_model(ctx)); - int n_past = mymodel->n_past; - int n_batch = N; // params.n_batch; - - for (int i = 0; i < (int) N; i += n_batch) { - int n_eval = (int) N - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_tokens(void * model, std::vector tokens) { - MyModel * mymodel = (MyModel* )model; - llama_context * ctx; - ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_past = mymodel->n_past; - for (int i = 0; i < (int) 
tokens.size(); i += params.n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_id(struct MyModel* mymodel, int id) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(mymodel, tokens); -} - -bool eval_string(struct MyModel * mymodel,const char* str){ - llama_context * ctx = mymodel->ctx; - std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx, str2, true); - eval_tokens(mymodel, embd_inp); - return true; -} - -llama_token sampling_id(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - llama_sampling_params & sparams = params.sampling_params; - // int n_ctx = llama_n_ctx(ctx); - - // out of user input, sample next token - const float temp = sparams.temp; - const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k; - const float top_p = sparams.top_p; - const float tfs_z = sparams.tfs_z; - const float typical_p = sparams.typical_p; - // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - // const float repeat_penalty = params.repeat_penalty; - // const float alpha_presence = params.presence_penalty; - // const float alpha_frequency = params.frequency_penalty; - const int mirostat = sparams.mirostat; - const float mirostat_tau = sparams.mirostat_tau; - const float mirostat_eta = sparams.mirostat_eta; - // const bool penalize_nl = params.penalize_nl; - - llama_token id = 0; - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(llama_get_model(ctx)); - - // Apply params.logit_bias map - for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // TODO: Apply penalties - // float nl_logit = logits[llama_token_nl(ctx)]; - // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - // llama_sample_repetition_penalty(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, repeat_penalty); - // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, alpha_frequency, alpha_presence); - // if (!penalize_nl) { - // logits[llama_token_nl(ctx)] = nl_logit; - // } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - 
llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temp(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - } - - return id; -} - -const char * sampling(struct MyModel * mymodel) { - llama_context * ctx = mymodel->ctx; - int id = sampling_id(mymodel); - static std::string ret; - if (id == llama_token_eos(ctx)) { - ret = ""; - } else { - ret = llama_token_to_piece(ctx, id); - } - eval_id(mymodel, id); - return ret.c_str(); -} - -} diff --git a/examples/embd-input/embd-input-test.cpp b/examples/embd-input/embd-input-test.cpp deleted file mode 100644 index dc4a0e488..000000000 --- a/examples/embd-input/embd-input-test.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "embd-input.h" -#include -#include -#include - -int main(int argc, char** argv) { - - auto mymodel = create_mymodel(argc, argv); - int N = 10; - int max_tgt_len = 500; - int n_embd = llama_n_embd(llama_get_model(mymodel->ctx)); - - // add random float embd to test evaluation - float * data = new float[N*n_embd]; - std::default_random_engine e; - std::uniform_real_distribution u(0,1); - for (int i=0;iparams.prompt.c_str()); - const char* tmp; - for (int i=0; i")==0) break; - printf("%s", tmp); - fflush(stdout); - } - printf("\n"); - free_mymodel(mymodel); - return 0; -} diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h deleted file mode 100644 index eff5e3b84..000000000 --- a/examples/embd-input/embd-input.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _EMBD_INPUT_H_ -#define _EMBD_INPUT_H_ 1 - -#include "common.h" -#include "llama.h" - -extern "C" { - -typedef struct MyModel { - llama_context* ctx; - gpt_params params; - int n_past = 0; -} MyModel; - -struct MyModel* create_mymodel(int argc, char ** argv); - -bool eval_float(void* model, float* input, int N); -bool eval_tokens(void* model, std::vector tokens); -bool eval_id(struct MyModel* mymodel, int id); -bool eval_string(struct MyModel* mymodel, const char* str); -const char * sampling(struct MyModel* mymodel); -llama_token sampling_id(struct MyModel* mymodel); -void free_mymodel(struct MyModel* mymodel); - -} - -#endif diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py deleted file mode 100755 index f146acdc1..000000000 --- a/examples/embd-input/embd_input.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -import ctypes -from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int -import numpy as np -import os - -libc = cdll.LoadLibrary("./libembdinput.so") -libc.sampling.restype=c_char_p -libc.create_mymodel.restype=c_void_p -libc.eval_string.argtypes=[c_void_p, c_char_p] -libc.sampling.argtypes=[c_void_p] -libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int] - - -class MyModel: - def __init__(self, args): - argc = len(args) - c_str = [c_char_p(i.encode()) for i in args] - args_c = (c_char_p * argc)(*c_str) - self.model = c_void_p(libc.create_mymodel(argc, args_c)) - self.max_tgt_len = 512 - self.print_string_eval = True - - def __del__(self): - libc.free_mymodel(self.model) - - def eval_float(self, x): - libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1]) - - def eval_string(self, x): - libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) - if self.print_string_eval: - print(x) - - def eval_token(self, x): - libc.eval_id(self.model, 
x) - - def sampling(self): - s = libc.sampling(self.model) - return s - - def stream_generate(self, end=""): - ret = b"" - end = end.encode() - for _ in range(self.max_tgt_len): - tmp = self.sampling() - ret += tmp - yield tmp - if ret.endswith(end): - break - - def generate_with_print(self, end=""): - ret = b"" - for i in self.stream_generate(end=end): - ret += i - print(i.decode(errors="replace"), end="", flush=True) - print("") - return ret.decode(errors="replace") - - - def generate(self, end=""): - text = b"".join(self.stream_generate(end=end)) - return text.decode(errors="replace") - -if __name__ == "__main__": - model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) - model.eval_string("""user: what is the color of the flag of UN?""") - x = np.random.random((5120,10))# , dtype=np.float32) - model.eval_float(x) - model.eval_string("""assistant:""") - for i in model.generate(): - print(i.decode(errors="replace"), end="", flush=True) diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py deleted file mode 100755 index 06fad55f4..000000000 --- a/examples/embd-input/llava.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from transformers import CLIPVisionModel, CLIPImageProcessor -from PIL import Image - -# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1' -vision_tower = "openai/clip-vit-large-patch14" -select_hidden_state_layer = -2 -# (vision_config.image_size // vision_config.patch_size) ** 2 -image_token_len = (224//14)**2 - -class Llava: - def __init__(self, args): - self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) - self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) - self.mm_projector = nn.Linear(1024, 5120) - self.model = MyModel(["main", *args]) - - def load_projection(self, path): - state = torch.load(path) - self.mm_projector.load_state_dict({ - "weight": state["model.mm_projector.weight"], - "bias": state["model.mm_projector.bias"]}) - - def chat(self, question): - self.model.eval_string("user: ") - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) - select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] - image_feature = select_hidden_state[:, 1:] - embd_image = self.mm_projector(image_feature) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("user: ") - self.model.eval_token(32003-2) # im_start - self.model.eval_float(embd_image.T) - for i in range(image_token_len-embd_image.shape[0]): - self.model.eval_token(32003-3) # im_patch - self.model.eval_token(32003-1) # im_end - self.model.eval_string(question) - self.model.eval_string("\nassistant: ") - return self.model.generate_with_print() - - -if __name__=="__main__": - # model form liuhaotian/LLaVA-13b-delta-v1-1 - a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"]) - # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. - # Also here can use pytorch_model-00003-of-00003.bin directly. 
- a.load_projection(os.path.join( - os.path.dirname(__file__) , - "llava_projection.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - respose - a.chat("what is the color of it?") - - - diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py deleted file mode 100755 index 7b13e4a5c..000000000 --- a/examples/embd-input/minigpt4.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch -from PIL import Image - -minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4") -sys.path.insert(0, minigpt4_path) -from minigpt4.models.blip2 import Blip2Base -from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor - - -class MiniGPT4(Blip2Base): - """ - MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4 - """ - def __init__(self, - args, - vit_model="eva_clip_g", - q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth", - img_size=224, - drop_path_rate=0, - use_grad_checkpoint=False, - vit_precision="fp32", - freeze_vit=True, - freeze_qformer=True, - num_query_token=32, - llama_model="", - prompt_path="", - prompt_template="", - max_txt_len=32, - end_sym='\n', - low_resource=False, # use 8 bit and put vit in cpu - device_8bit=0 - ): - super().__init__() - self.img_size = img_size - self.low_resource = low_resource - self.preprocessor = Blip2ImageEvalProcessor(img_size) - - print('Loading VIT') - self.visual_encoder, self.ln_vision = self.init_vision_encoder( - vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision - ) - print('Loading VIT Done') - print('Loading Q-Former') - self.Qformer, self.query_tokens = self.init_Qformer( - num_query_token, self.visual_encoder.num_features - ) - self.Qformer.cls = None - self.Qformer.bert.embeddings.word_embeddings = None - self.Qformer.bert.embeddings.position_embeddings = None - for layer in self.Qformer.bert.encoder.layer: - layer.output = None - layer.intermediate = None - self.load_from_pretrained(url_or_filename=q_former_model) - print('Loading Q-Former Done') - self.llama_proj = nn.Linear( - self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size - ) - self.max_txt_len = max_txt_len - self.end_sym = end_sym - self.model = MyModel(["main", *args]) - # system prompt - self.model.eval_string("Give the following image: ImageContent. " - "You will be able to see the image once I provide it to you. Please answer my questions." 
- "###") - - def encode_img(self, image): - image = self.preprocessor(image) - image = image.unsqueeze(0) - device = image.device - if self.low_resource: - self.vit_to_cpu() - image = image.to("cpu") - - with self.maybe_autocast(): - image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) - image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device) - - query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) - query_output = self.Qformer.bert( - query_embeds=query_tokens, - encoder_hidden_states=image_embeds, - encoder_attention_mask=image_atts, - return_dict=True, - ) - - inputs_llama = self.llama_proj(query_output.last_hidden_state) - # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device) - return inputs_llama - - def load_projection(self, path): - state = torch.load(path)["model"] - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def chat(self, question): - self.model.eval_string("Human: ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - def chat_with_image(self, image, question): - with torch.no_grad(): - embd_image = self.encode_img(image) - embd_image = embd_image.cpu().numpy()[0] - self.model.eval_string("Human: ") - self.model.eval_float(embd_image.T) - self.model.eval_string(" ") - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - return self.model.generate_with_print(end="###") - - -if __name__=="__main__": - a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"]) - a.load_projection(os.path.join( - os.path.dirname(__file__) , - "pretrained_minigpt4.pth")) - respose = a.chat_with_image( - Image.open("./media/llama1-logo.png").convert('RGB'), - "what is the text in the picture?") - a.chat("what is the color of it?") diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py deleted file mode 100755 index 891ad7cc9..000000000 --- a/examples/embd-input/panda_gpt.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 -import sys -import os -sys.path.insert(0, os.path.dirname(__file__)) -from embd_input import MyModel -import numpy as np -from torch import nn -import torch - -# use PandaGPT path -panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT") -imagebind_ckpt_path = "./models/panda_gpt/" - -sys.path.insert(0, os.path.join(panda_gpt_path,"code","model")) -from ImageBind.models import imagebind_model -from ImageBind import data - -ModalityType = imagebind_model.ModalityType -max_tgt_len = 400 - -class PandaGPT: - def __init__(self, args): - self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path) - self.visual_encoder.eval() - self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120) - self.max_tgt_len = max_tgt_len - self.model = MyModel(["main", *args]) - self.generated_text = "" - self.device = "cpu" - - def load_projection(self, path): - state = torch.load(path, map_location="cpu") - self.llama_proj.load_state_dict({ - "weight": state["llama_proj.weight"], - "bias": state["llama_proj.bias"]}) - - def eval_inputs(self, inputs): - self.model.eval_string("") - embds = self.extract_multimoal_feature(inputs) - for i in embds: - self.model.eval_float(i.T) - self.model.eval_string(" ") - - def chat(self, question): - return self.chat_with_image(None, question) - - def chat_with_image(self, inputs, 
question): - if self.generated_text == "": - self.model.eval_string("###") - self.model.eval_string(" Human: ") - if inputs: - self.eval_inputs(inputs) - self.model.eval_string(question) - self.model.eval_string("\n### Assistant:") - ret = self.model.generate_with_print(end="###") - self.generated_text += ret - return ret - - def extract_multimoal_feature(self, inputs): - features = [] - for key in ["image", "audio", "video", "thermal"]: - if key + "_paths" in inputs: - embeds = self.encode_data(key, inputs[key+"_paths"]) - features.append(embeds) - return features - - def encode_data(self, data_type, data_paths): - - type_map = { - "image": ModalityType.VISION, - "audio": ModalityType.AUDIO, - "video": ModalityType.VISION, - "thermal": ModalityType.THERMAL, - } - load_map = { - "image": data.load_and_transform_vision_data, - "audio": data.load_and_transform_audio_data, - "video": data.load_and_transform_video_data, - "thermal": data.load_and_transform_thermal_data - } - - load_function = load_map[data_type] - key = type_map[data_type] - - inputs = {key: load_function(data_paths, self.device)} - with torch.no_grad(): - embeddings = self.visual_encoder(inputs) - embeds = embeddings[key] - embeds = self.llama_proj(embeds).cpu().numpy() - return embeds - - -if __name__=="__main__": - a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) - a.load_projection("./models/panda_gpt/adapter_model.bin") - a.chat_with_image( - {"image_paths": ["./media/llama1-logo.png"]}, - "what is the text in the picture? 'llama' or 'lambda'?") - a.chat("what is the color of it?") diff --git a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp deleted file mode 100644 index 9d433f4b1..000000000 --- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp +++ /dev/null @@ -1,1133 +0,0 @@ -#ifndef CMPNCT_GPT2BPE -#define CMPNCT_GPT2BPE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// Unicode GPT2 Byte Pair Encoding Tokenizer -// Adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License] -// Removed loading of merges from HF json and parts made for a specific vocab - - -//----------------- -// Unicode library (from cmpnct_unicode.cpp) -//----------------- - -// Minimal library for high performance handling and categorization of UTF8 strings and characters -// Using std::string - -enum CNCTCharType { - DIGIT, // a numerical char in any language - LETTER, // a letter in any language - WHITESPACE, // any form of whitespace - ACCENT_MARK, // letter modifiers like ´ in é - PUNCTUATION, // punctuation including brackets - SYMBOL, // math, currency, other symbols - CONTROL, // control characters - MIXED, // a mix of the above - UNIDENTIFIED // something more exotic like emoji or separators -}; - -struct CNCTUnicode; - -struct CNCTString { - std::string str; - size_t utf8_chars; - - CNCTCharType char_type=UNIDENTIFIED; - bool is_sequential=false; - - size_t seq_offset_bytes=0; - size_t seq_offset_utf8_chars=0; - - bool operator==(const std::string &other) const; - bool operator==(const char other) const; - bool operator==(const CNCTString &other) const; - CNCTString &operator+=(const std::string &other); - CNCTString &operator+=(const char other); - friend CNCTString operator+(CNCTString lhs, const std::string &rhs); - friend CNCTString operator+(CNCTString lhs, const char rhs); - CNCTString& operator+=(const CNCTString& other); - friend CNCTString 
operator+(CNCTString lhs, const CNCTString& rhs); -}; - -struct CNCTUnicode { - static bool check_code_range(int c, const std::vector>& ranges); - static CNCTCharType get_code_type(int c); - static CNCTCharType get_code_type(const std::string &utf8_char); - static int utf8_len(const char c); - static int strlen_utf8(std::string src); - static std::vector split_utf8(const std::string &src); - static std::vector split_utf8_enhanced(const std::string &src); - static CNCTCharType string_identify(const std::string& str); - static bool string_test(const std::string& str, CNCTCharType chartype); -}; - -static const std::vector> digit_ranges = { -{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, -{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F}, -{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468}, -{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909}, -{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A}, -{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739}, -{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9}, -{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9}, -}; - -static const std::vector> letter_ranges = { -{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, -{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559}, -{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, -{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A}, -{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, -{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33}, -{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD}, -{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 
0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61}, -{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, -{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3}, -{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, -{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, -{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, -{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, -{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5}, -{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C}, -{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3}, -{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, -{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, -{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, -{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, -{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, -{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, -{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F}, -{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006}, -{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 
0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, -{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788}, -{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, -{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, -{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4}, -{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, -{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, -{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, -{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA}, -{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D}, -{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, -{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7}, -{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35}, -{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C}, -{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, -{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288}, -{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339}, -{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE}, 
-{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909}, -{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, -{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F}, -{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0}, -{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, -{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, -{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, -{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, -{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, -{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D}, -{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, -{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, -{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, -{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734}, -{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, -}; - -static const std::vector> whitespace_ranges = { -{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}, -}; - -static const std::vector> accent_mark_ranges = { -{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4}, -{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 
0x82D}, {0x859, 0x85B}, -{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7}, -{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC}, -{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63}, -{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83}, -{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57}, -{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC}, -{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E}, -{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734}, -{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E}, -{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2}, -{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F}, -{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881}, -{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D}, -{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E}, -{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, -{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134}, -{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303}, -{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, 
{0x1145E, 0x1145E}, -{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938}, -{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47}, -{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, -{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92}, -{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36}, -{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, -{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF}, -}; - -static const std::vector> punctuation_ranges = { -{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB}, -{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D}, -{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76}, -{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA}, -{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A}, -{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027}, -{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998}, -{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F}, -{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF}, -{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F}, -{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20}, 
-{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857}, -{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D}, -{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9}, -{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946}, -{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F}, -{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F}, -}; - -static const std::vector> symbol_ranges = { -{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7}, -{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608}, -{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA}, -{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5}, -{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C}, -{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF}, -{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B}, -{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4}, -{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, -{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3}, -{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A}, -{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, 
{0xFE69, 0xFE69}, -{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F}, -{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F}, -{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241}, -{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789}, -{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC}, -{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD}, -{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8}, -{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53}, -{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA}, -}; - -static const std::vector> control_ranges = { -{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C}, -{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F}, -{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5}, -{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29}, -{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80}, -{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5}, -{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54}, -{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7}, -{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, 
{0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C}, -{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB}, -{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49}, -{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7}, -{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5}, -{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC}, -{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF}, -{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F}, -{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF}, -{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F}, -{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F}, -{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F}, -{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC}, -{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF}, -{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F}, -{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF}, -{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF}, -{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F}, -{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, 
{0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA}, -{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA}, -{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2}, -{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1}, -{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B}, -{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F}, -{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F}, -{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807}, -{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E}, -{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E}, -{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8}, -{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF}, -{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF}, -{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E}, -{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334}, -{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C}, -{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF}, -{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936}, -{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, 
{0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09}, -{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B}, -{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF}, -{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F}, -{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF}, -{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163}, -{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A}, -{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8}, -{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F}, -{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A}, -{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6}, -{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26}, -{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50}, -{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B}, -{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF}, -{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F}, -{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F}, -{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F}, -{0x1FA87, 
0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F}, -{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, -}; - -//String -bool CNCTString::operator==(const std::string& other) const { - return str.compare(other) == 0; -} -bool CNCTString::operator==(const char other) const { - return str.compare(std::string(1, other)) == 0; -} -bool CNCTString::operator==(const CNCTString& other) const { - return str.compare(other.str) == 0; -} -// + operators -CNCTString& CNCTString::operator+=(const std::string& other) { - str += other; - int new_len = CNCTUnicode::strlen_utf8(other); - utf8_chars += new_len; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.size(); - seq_offset_utf8_chars += new_len; - return *this; -} - -CNCTString& CNCTString::operator+=(const char other) { - std::string str = std::string(1, other); - *this += str; - return *this; -} - -CNCTString& CNCTString::operator+=(const CNCTString& other) { - str += other.str; - utf8_chars += other.utf8_chars; - char_type = CNCTUnicode::string_identify(str); - seq_offset_bytes += other.str.size(); - seq_offset_utf8_chars += other.utf8_chars; - return *this; -} - -struct CRCompare { - bool operator()(const std::pair& p, int i) { - return p.second < i; - } - bool operator()(int i, const std::pair& p) { - return i < p.first; - } -}; - -// binary search for code range -bool CNCTUnicode::check_code_range(int c, const std::vector> &ranges) { - auto it = std::upper_bound(ranges.begin(), ranges.end(), c, CRCompare()); - if (it != ranges.begin()) { - --it; - } - return c >= it->first && c <= it->second; -} - -// these are binary searches, it takes only a few operations -CNCTCharType CNCTUnicode::get_code_type(int c) { - if (check_code_range(c, letter_ranges)) { - return LETTER; - } - if (check_code_range(c, digit_ranges)) { - return DIGIT; - } - if (check_code_range(c, whitespace_ranges)) { - return WHITESPACE; - } - if (check_code_range(c, punctuation_ranges)) { - return PUNCTUATION; - } - if (check_code_range(c, symbol_ranges)) { - return SYMBOL; - } - if (check_code_range(c, accent_mark_ranges)) { - return ACCENT_MARK; - } - if (check_code_range(c, control_ranges)) { - return CONTROL; - } - return UNIDENTIFIED; -} - -static int utf8_to_unicode(const std::string& utf8_char) { - int c = 0; - int len = (int)utf8_char.size(); - if (len == 1) { - c = utf8_char[0]; - } else if (len == 2) { - c = ((utf8_char[0] & 0x1F) << 6) | (utf8_char[1] & 0x3F); - } else if (len == 3) { - c = ((utf8_char[0] & 0x0F) << 12) | ((utf8_char[1] & 0x3F) << 6) | (utf8_char[2] & 0x3F); - } else if (len == 4) { - c = ((utf8_char[0] & 0x07) << 18) | ((utf8_char[1] & 0x3F) << 12) | ((utf8_char[2] & 0x3F) << 6) | (utf8_char[3] & 0x3F); - } - return c; -} - -CNCTCharType CNCTUnicode::get_code_type(const std::string &utf8_char) { - return get_code_type(utf8_to_unicode(utf8_char)); -} - -int CNCTUnicode::utf8_len(const char c) -{ - if ((c & 0x80) == 0) { - return 1; // ASCII character - } - if ((c & 0xE0) == 0xC0) { - return 2; // 2-byte character - } - if ((c & 0xF0) == 0xE0) { - return 3; // 3-byte character - } - if ((c & 0xF0) == 0xF0) { - return 4; // 4-byte character - } - return 1; // not valid utf8 - // static const uint8_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - // return lookup[static_cast(c) >> 4]; -} - -int 
CNCTUnicode::strlen_utf8(const std::string src) { - int len = 0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - if (char_len > 1) { - it += char_len - 1; - } - len += 1; - } - return len; -} - -// split a string into unicode strings -std::vector CNCTUnicode::split_utf8(const std::string &src) { - std::vector result; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - result.push_back(str); - if (char_len > 1) { - it += char_len - 1; - } - } - return result; -} - -// split a string into unicode strings (CNCTString) with sequence information -std::vector CNCTUnicode::split_utf8_enhanced(const std::string &src) { - std::vector result; - int seq_offset_bytes=0; - int seq_offset_utf8_chars=0; - for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) { - int char_len = utf8_len(*it); - std::string str(it, it + char_len); - CNCTString cnct_str; - cnct_str.seq_offset_bytes = seq_offset_bytes; - cnct_str.seq_offset_utf8_chars = seq_offset_utf8_chars; - cnct_str.str = str; - cnct_str.utf8_chars = 1; - cnct_str.char_type = get_code_type(str); - #if 0 - switch (cnct_str.char_type) - { - case DIGIT: - printf("%s = DIGIT\n", str.c_str()); - break; - case LETTER: - printf("%s = LETTER\n", str.c_str()); - break; - case WHITESPACE: - printf("%s = WHITESPACE\n", str.c_str()); - break; - case PUNCTUATION: - printf("%s = PUNCTUATION\n", str.c_str()); - break; - case UNIDENTIFIED: - printf("%s = UNIDENTIFIED\n", str.c_str()); - break; - case SYMBOL: - printf("%s = SYMBOL\n", str.c_str()); - break; - case CONTROL: - printf("%s = CONTROL\n", str.c_str()); - break; - } - #endif - - result.push_back(cnct_str); - seq_offset_bytes += char_len; - seq_offset_utf8_chars += 1; - if (char_len > 1) { - it += char_len - 1; - } - - } - return result; -} - -// return the type of the string -CNCTCharType CNCTUnicode::string_identify(const std::string &str) { - CNCTCharType result = UNIDENTIFIED; - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - switch (get_code_type(c)) { - case DIGIT: - if (result == UNIDENTIFIED) { - result = DIGIT; - } else if (result != DIGIT) { - return MIXED; - } - break; - case LETTER: - if (result == UNIDENTIFIED) { - result = LETTER; - } else if (result != LETTER) { - return MIXED; - } - break; - case WHITESPACE: - if (result == UNIDENTIFIED) { - result = WHITESPACE; - } else if (result != WHITESPACE) { - return MIXED; - } - break; - case PUNCTUATION: - if (result == UNIDENTIFIED) { - result = PUNCTUATION; - } else if (result != PUNCTUATION) { - return MIXED; - } - break; - default: - return MIXED; - break; - } - } - return result; -} - -// verify the content of a string -bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype) -{ - std::string::const_iterator it = str.begin(); - while (it != str.end()) { - int len = utf8_len(*it); - int c = 0; - for (int i = 0; i < len && it != str.end(); ++i, ++it) { - c = (c << 8) | static_cast(*it); - } - if (get_code_type(c) != chartype) { - return false; - } - } - return true; -} - -//----------------- -// llama.cpp GPT2 vocab (from libfalcon.cpp) -//----------------- - -std::string replaceAll(std::string str, const std::string& from, const std::string& to) { - size_t start_pos = 0; - while((start_pos 
= str.find(from, start_pos)) != std::string::npos) { - str.replace(start_pos, from.length(), to); - start_pos += to.length(); // Handles case where 'to' is a substring of 'from' - } - return str; -} - -struct TrieNode { - std::map map; - int32_t Id = -1; -}; - -struct Trie { - TrieNode *root; - - Trie() : root(new TrieNode()) {} - - ~Trie() { - if(root) - deleteTrie(root); - } - - // Move constructor - Trie(Trie&& other) noexcept : root(other.root) { - other.root = nullptr; - } - - // Move assignment operator - Trie& operator=(Trie&& other) noexcept { - if (this != &other) { - if(root) - deleteTrie(root); - root = other.root; - other.root = nullptr; - } - return *this; - } - - void insert(const std::string &token, int32_t Id) { - TrieNode* current = root; - for(auto ch : token) { - if(current->map.find(ch) == current->map.end()) { - current->map[ch] = new TrieNode(); - } - current = current->map[ch]; - } - current->Id = Id; - } - - void reset() { - deleteTrie(root); - root = new TrieNode(); - } - -private: - void deleteTrie(TrieNode* node) { - for(auto &it: node->map) { - deleteTrie(it.second); - } - delete node; - } - -}; - -struct gpt2bpe_vocab { - using id = int32_t; - using token = std::string; - - std::map max_token_length; // max length, for each 2byte prefix - std::map, int> bpe_ranks; - std::vector> bpe_merges; - - id special_bos_id = -1; - id special_eos_id = -1; - id special_unk_id = -1; - id special_sep_id = -1; - id special_pad_id = -1; - - id linefeed_id = -1; - - std::unordered_map token_to_id; - std::unordered_map id_to_token; - - Trie trie; // highspeed access to tokens by prefix tree - - // populate trie from map - void populate_trie_from_map() { - trie.reset(); - for (const auto& pair : token_to_id) { - trie.insert(pair.first, pair.second); - if (pair.first.size() >= 2) { - std::string prefix = pair.first.substr(0, 2); - max_token_length[prefix] = std::max(max_token_length[prefix], (uint32_t)pair.first.size()); - } - } - } - // populate token ranks map - int populate_bpe_ranks(std::vector> bpe_merges_) { - for (int i = 0; i < (int)bpe_merges_.size(); i++) { - bpe_ranks.emplace(bpe_merges_[i], i); - } - bpe_merges = bpe_merges_; - return bpe_merges_.size(); - } - - // Trim whitespace characters from the beginning and end of the string - void trim(std::string& str) { - // Remove whitespace characters from the beginning of the string - str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](int ch) { - return !std::isspace(ch); - })); - - // Remove whitespace characters from the end of the string - str.erase(std::find_if(str.rbegin(), str.rend(), [](int ch) { - return !std::isspace(ch); - }).base(), str.end()); - } - - // get max token length available for a prefix of 2 bytes (string at least 2 bytes long) - int get_max_token_length(const std::string& string) const { - if (string.size() < 2) { - return -1; - } - std::string prefix = string.substr(0, 2); - if (max_token_length.find(prefix) == max_token_length.end()) { - return 0; - } - return max_token_length.at(prefix); - } - - // function to find if two tokens match in bpe_rank, return rank or -1 - int find_bpe_rank(const std::string& token1, const std::string& token2) const { - std::string left_token = token1; - std::string right_token = token2; - left_token = replaceAll(left_token, " ", "Ġ"); - left_token = replaceAll(left_token, "\n", "Ċ"); - right_token = replaceAll(right_token, " ", "Ġ"); - right_token = replaceAll(right_token, "\n", "Ċ"); - - auto it = bpe_ranks.find(std::make_pair(left_token, right_token)); - 
if (it == bpe_ranks.end()) { - return -1; - } - return it->second; - } - - std::pair find_longest_match(const std::string& snippet) const { - TrieNode* current = trie.root; - gpt2bpe_vocab::id last_matched_id = -1; - std::string last_matched_token = ""; - std::string current_token = ""; - for (auto ch : snippet) { - if (current->map.find(ch) == current->map.end()) { - break; - } - current = current->map[ch]; - current_token += ch; - if (current->Id != -1) { - last_matched_id = current->Id; - last_matched_token = current_token; - } - } - return {last_matched_id, last_matched_token}; - } - -}; - - -// -// tokenizer - bpe type, gpt2 tokenization compatible -// - -struct ggllm_bpe_symbol { - using index = int; - index prev; - index next; - const char * text; - size_t n; -}; - -static_assert(std::is_trivially_copyable::value, "ggllm_bpe_symbol is not trivially copyable"); - -struct ggllm_bpe_bigram { - struct comparator { - bool operator()(ggllm_bpe_bigram & l, ggllm_bpe_bigram & r) { - return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); - } - }; - - using queue_storage = std::vector; - using queue = std::priority_queue; - ggllm_bpe_symbol::index left; - ggllm_bpe_symbol::index right; - std::string text; - int rank; - size_t size; -}; - -struct gpt2bpe_tokenizer { - gpt2bpe_tokenizer(const gpt2bpe_vocab & vocab, bool g2ws_): vocab_(vocab) { flag_g2ws = g2ws_; } - - void tokenize(const std::string & text, std::vector & output) { - int final_prev_index = -1; - // auto start = ggml_time_us(); - auto word_collection = bpe_gpt2_preprocess(text); - // auto end = ggml_time_us(); - // fprintf(stderr, "%s: preprocessing took %0.3f ms\n", __func__, (end - start) / 1000.0); - - symbols_final.clear(); - - for (auto & word : word_collection) { - work_queue_ = ggllm_bpe_bigram::queue(); - symbols_.clear(); - - int index = 0; - size_t offset = 0; - - while (offset < word.size()) { - ggllm_bpe_symbol sym; - size_t char_len = std::min(word.size() - offset, (size_t) CNCTUnicode::utf8_len(word[offset])); - sym.text = word.c_str() + offset; - sym.n = 1; - sym.n = char_len; - offset += sym.n; - sym.prev = index - 1; - sym.next = offset == word.size() ? 
-1 : index + 1; - index++; - symbols_.emplace_back(sym); - } - for (size_t i = 1; i < symbols_.size(); ++i) { - add_new_bigram(i - 1, i); - } - - // build token(s) - while (!work_queue_.empty()) { - auto bigram = work_queue_.top(); - work_queue_.pop(); - - auto & left_symbol = symbols_[bigram.left]; - auto & right_symbol = symbols_[bigram.right]; - - if (left_symbol.n == 0 || right_symbol.n == 0) { - continue; - } - std::string left_token = std::string(left_symbol.text, left_symbol.n); - std::string right_token = std::string(right_symbol.text, right_symbol.n); - if (left_token + right_token != bigram.text) { - continue; // Skip this bigram if it's outdated - } - - // merge the right sym into the left one - left_symbol.n += right_symbol.n; - right_symbol.n = 0; - - // remove the right sym from the chain - left_symbol.next = right_symbol.next; - if (right_symbol.next >= 0) { - symbols_[right_symbol.next].prev = bigram.left; - } - - add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol - add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol - } - - // add the fnished tokens to the final list keeping correct order for next and prev - for (auto & sym : symbols_) { - if (sym.n > 0) { - sym.prev = final_prev_index; - sym.next = -1; - if (final_prev_index != -1) { - symbols_final[final_prev_index].next = symbols_final.size(); - } - symbols_final.emplace_back(sym); - final_prev_index = symbols_final.size() - 1; - } - } - } - - symbols_ = symbols_final; - if (symbols_.size()) - for (int i = 0; i != -1; i = symbols_[i].next) { - auto & symbol = symbols_[i]; - if (symbol.n == 0) { - continue; - } - std::string str = std::string(symbol.text, symbol.n); - std::string str_decoded = decode_token(str); - auto token = vocab_.token_to_id.find(str_decoded); - - if (token == vocab_.token_to_id.end()) { - for (auto j = str_decoded.begin(); j != str_decoded.end(); ++j) { - std::string byte_str(1, *j); - auto token_multibyte = vocab_.token_to_id.find(byte_str); - if (token_multibyte == vocab_.token_to_id.end()) { - fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); - } - output.push_back((*token_multibyte).second); - } - } else { - output.push_back((*token).second); - } - } - } - -private: - void add_new_bigram(int left, int right) { - if (left == -1 || right == -1) return; - - std::string left_token = std::string(symbols_[left].text, symbols_[left].n); - std::string right_token = std::string(symbols_[right].text, symbols_[right].n); - - int rank_found = -1; - rank_found = vocab_.find_bpe_rank(left_token, right_token); - - if (rank_found < 0) { - return; - } - - ggllm_bpe_bigram bigram; - bigram.left = left; - bigram.right = right; - bigram.rank = rank_found; - bigram.size = left_token.size() + right_token.size(); - bigram.text = left_token + right_token; - work_queue_.push(bigram); - } - - std::unordered_map bytes_to_unicode() { - static std::unordered_map hex_map = { - { 0x21, "\x21" }, { 0x22, "\x22" }, { 0x23, "\x23" }, { 0x24, "\x24" }, { 0x25, "\x25" }, { 0x26, "\x26" }, { 0x27, "\x27" }, { 0x28, "\x28" }, { 0x29, "\x29" }, { 0x2A, "\x2A" }, - { 0x2B, "\x2B" }, { 0x2C, "\x2C" }, { 0x2D, "\x2D" }, { 0x2E, "\x2E" }, { 0x2F, "\x2F" }, { 0x30, "\x30" }, { 0x31, "\x31" }, { 0x32, "\x32" }, { 0x33, "\x33" }, { 0x34, "\x34" }, - { 0x35, "\x35" }, { 0x36, "\x36" }, { 0x37, "\x37" }, { 0x38, "\x38" }, { 0x39, "\x39" }, { 0x3A, "\x3A" }, { 0x3B, "\x3B" }, { 0x3C, "\x3C" }, { 0x3D, "\x3D" }, { 0x3E, "\x3E" }, - { 0x3F, "\x3F" }, { 0x40, "\x40" 
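For reference, the merge loop above is the standard BPE procedure: among all adjacent symbol pairs in a pre-tokenized word, repeatedly fuse the pair with the lowest merge rank until no mergeable pair remains. The deleted code drives this with a priority queue and linked symbol indices for speed; a minimal rescan-based sketch with a toy rank table (illustrative only, not the real GPT-2 merge list):

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
    // rank map: lower rank means the pair is merged earlier
    std::map<std::pair<std::string, std::string>, int> ranks = {
        {{"l", "o"}, 0}, {{"lo", "w"}, 1}, {{"e", "r"}, 2}, {{"low", "er"}, 3},
    };

    std::vector<std::string> sym = {"l", "o", "w", "e", "r"};   // one word, split into characters

    for (;;) {
        int best_rank = -1, best_pos = -1;
        for (size_t i = 0; i + 1 < sym.size(); ++i) {
            auto it = ranks.find({sym[i], sym[i + 1]});
            if (it != ranks.end() && (best_rank < 0 || it->second < best_rank)) {
                best_rank = it->second;
                best_pos  = (int) i;
            }
        }
        if (best_pos < 0) break;                       // no mergeable pair left
        sym[best_pos] += sym[best_pos + 1];            // fuse the winning pair
        sym.erase(sym.begin() + best_pos + 1);
    }

    for (const auto & s : sym) printf("'%s' ", s.c_str());   // prints: 'lower'
    printf("\n");
}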
}, { 0x41, "\x41" }, { 0x42, "\x42" }, { 0x43, "\x43" }, { 0x44, "\x44" }, { 0x45, "\x45" }, { 0x46, "\x46" }, { 0x47, "\x47" }, { 0x48, "\x48" }, - { 0x49, "\x49" }, { 0x4A, "\x4A" }, { 0x4B, "\x4B" }, { 0x4C, "\x4C" }, { 0x4D, "\x4D" }, { 0x4E, "\x4E" }, { 0x4F, "\x4F" }, { 0x50, "\x50" }, { 0x51, "\x51" }, { 0x52, "\x52" }, - { 0x53, "\x53" }, { 0x54, "\x54" }, { 0x55, "\x55" }, { 0x56, "\x56" }, { 0x57, "\x57" }, { 0x58, "\x58" }, { 0x59, "\x59" }, { 0x5A, "\x5A" }, { 0x5B, "\x5B" }, { 0x5C, "\x5C" }, - { 0x5D, "\x5D" }, { 0x5E, "\x5E" }, { 0x5F, "\x5F" }, { 0x60, "\x60" }, { 0x61, "\x61" }, { 0x62, "\x62" }, { 0x63, "\x63" }, { 0x64, "\x64" }, { 0x65, "\x65" }, { 0x66, "\x66" }, - { 0x67, "\x67" }, { 0x68, "\x68" }, { 0x69, "\x69" }, { 0x6A, "\x6A" }, { 0x6B, "\x6B" }, { 0x6C, "\x6C" }, { 0x6D, "\x6D" }, { 0x6E, "\x6E" }, { 0x6F, "\x6F" }, { 0x70, "\x70" }, - { 0x71, "\x71" }, { 0x72, "\x72" }, { 0x73, "\x73" }, { 0x74, "\x74" }, { 0x75, "\x75" }, { 0x76, "\x76" }, { 0x77, "\x77" }, { 0x78, "\x78" }, { 0x79, "\x79" }, { 0x7A, "\x7A" }, - { 0x7B, "\x7B" }, { 0x7C, "\x7C" }, { 0x7D, "\x7D" }, { 0x7E, "\x7E" }, { 0xA1, "\xC2\xA1" }, { 0xA2, "\xC2\xA2" }, { 0xA3, "\xC2\xA3" }, { 0xA4, "\xC2\xA4" }, { 0xA5, "\xC2\xA5" }, - { 0xA6, "\xC2\xA6" }, { 0xA7, "\xC2\xA7" }, { 0xA8, "\xC2\xA8" }, { 0xA9, "\xC2\xA9" }, { 0xAA, "\xC2\xAA" }, { 0xAB, "\xC2\xAB" }, { 0xAC, "\xC2\xAC" }, { 0xAE, "\xC2\xAE" }, - { 0xAF, "\xC2\xAF" }, { 0xB0, "\xC2\xB0" }, { 0xB1, "\xC2\xB1" }, { 0xB2, "\xC2\xB2" }, { 0xB3, "\xC2\xB3" }, { 0xB4, "\xC2\xB4" }, { 0xB5, "\xC2\xB5" }, { 0xB6, "\xC2\xB6" }, - { 0xB7, "\xC2\xB7" }, { 0xB8, "\xC2\xB8" }, { 0xB9, "\xC2\xB9" }, { 0xBA, "\xC2\xBA" }, { 0xBB, "\xC2\xBB" }, { 0xBC, "\xC2\xBC" }, { 0xBD, "\xC2\xBD" }, { 0xBE, "\xC2\xBE" }, - { 0xBF, "\xC2\xBF" }, { 0xC0, "\xC3\x80" }, { 0xC1, "\xC3\x81" }, { 0xC2, "\xC3\x82" }, { 0xC3, "\xC3\x83" }, { 0xC4, "\xC3\x84" }, { 0xC5, "\xC3\x85" }, { 0xC6, "\xC3\x86" }, - { 0xC7, "\xC3\x87" }, { 0xC8, "\xC3\x88" }, { 0xC9, "\xC3\x89" }, { 0xCA, "\xC3\x8A" }, { 0xCB, "\xC3\x8B" }, { 0xCC, "\xC3\x8C" }, { 0xCD, "\xC3\x8D" }, { 0xCE, "\xC3\x8E" }, - { 0xCF, "\xC3\x8F" }, { 0xD0, "\xC3\x90" }, { 0xD1, "\xC3\x91" }, { 0xD2, "\xC3\x92" }, { 0xD3, "\xC3\x93" }, { 0xD4, "\xC3\x94" }, { 0xD5, "\xC3\x95" }, { 0xD6, "\xC3\x96" }, - { 0xD7, "\xC3\x97" }, { 0xD8, "\xC3\x98" }, { 0xD9, "\xC3\x99" }, { 0xDA, "\xC3\x9A" }, { 0xDB, "\xC3\x9B" }, { 0xDC, "\xC3\x9C" }, { 0xDD, "\xC3\x9D" }, { 0xDE, "\xC3\x9E" }, - { 0xDF, "\xC3\x9F" }, { 0xE0, "\xC3\xA0" }, { 0xE1, "\xC3\xA1" }, { 0xE2, "\xC3\xA2" }, { 0xE3, "\xC3\xA3" }, { 0xE4, "\xC3\xA4" }, { 0xE5, "\xC3\xA5" }, { 0xE6, "\xC3\xA6" }, - { 0xE7, "\xC3\xA7" }, { 0xE8, "\xC3\xA8" }, { 0xE9, "\xC3\xA9" }, { 0xEA, "\xC3\xAA" }, { 0xEB, "\xC3\xAB" }, { 0xEC, "\xC3\xAC" }, { 0xED, "\xC3\xAD" }, { 0xEE, "\xC3\xAE" }, - { 0xEF, "\xC3\xAF" }, { 0xF0, "\xC3\xB0" }, { 0xF1, "\xC3\xB1" }, { 0xF2, "\xC3\xB2" }, { 0xF3, "\xC3\xB3" }, { 0xF4, "\xC3\xB4" }, { 0xF5, "\xC3\xB5" }, { 0xF6, "\xC3\xB6" }, - { 0xF7, "\xC3\xB7" }, { 0xF8, "\xC3\xB8" }, { 0xF9, "\xC3\xB9" }, { 0xFA, "\xC3\xBA" }, { 0xFB, "\xC3\xBB" }, { 0xFC, "\xC3\xBC" }, { 0xFD, "\xC3\xBD" }, { 0xFE, "\xC3\xBE" }, - { 0xFF, "\xC3\xBF" }, { 0x00, "\xC4\x80" }, { 0x01, "\xC4\x81" }, { 0x02, "\xC4\x82" }, { 0x03, "\xC4\x83" }, { 0x04, "\xC4\x84" }, { 0x05, "\xC4\x85" }, { 0x06, "\xC4\x86" }, - { 0x07, "\xC4\x87" }, { 0x08, "\xC4\x88" }, { 0x09, "\xC4\x89" }, { 0x0A, "\xC4\x8A" }, { 0x0B, "\xC4\x8B" }, { 0x0C, "\xC4\x8C" }, { 0x0D, "\xC4\x8D" }, { 0x0E, "\xC4\x8E" }, 
- { 0x0F, "\xC4\x8F" }, { 0x10, "\xC4\x90" }, { 0x11, "\xC4\x91" }, { 0x12, "\xC4\x92" }, { 0x13, "\xC4\x93" }, { 0x14, "\xC4\x94" }, { 0x15, "\xC4\x95" }, { 0x16, "\xC4\x96" }, - { 0x17, "\xC4\x97" }, { 0x18, "\xC4\x98" }, { 0x19, "\xC4\x99" }, { 0x1A, "\xC4\x9A" }, { 0x1B, "\xC4\x9B" }, { 0x1C, "\xC4\x9C" }, { 0x1D, "\xC4\x9D" }, { 0x1E, "\xC4\x9E" }, - { 0x1F, "\xC4\x9F" }, { 0x20, "\xC4\xA0" }, { 0x7F, "\xC4\xA1" }, { 0x80, "\xC4\xA2" }, { 0x81, "\xC4\xA3" }, { 0x82, "\xC4\xA4" }, { 0x83, "\xC4\xA5" }, { 0x84, "\xC4\xA6" }, - { 0x85, "\xC4\xA7" }, { 0x86, "\xC4\xA8" }, { 0x87, "\xC4\xA9" }, { 0x88, "\xC4\xAA" }, { 0x89, "\xC4\xAB" }, { 0x8A, "\xC4\xAC" }, { 0x8B, "\xC4\xAD" }, { 0x8C, "\xC4\xAE" }, - { 0x8D, "\xC4\xAF" }, { 0x8E, "\xC4\xB0" }, { 0x8F, "\xC4\xB1" }, { 0x90, "\xC4\xB2" }, { 0x91, "\xC4\xB3" }, { 0x92, "\xC4\xB4" }, { 0x93, "\xC4\xB5" }, { 0x94, "\xC4\xB6" }, - { 0x95, "\xC4\xB7" }, { 0x96, "\xC4\xB8" }, { 0x97, "\xC4\xB9" }, { 0x98, "\xC4\xBA" }, { 0x99, "\xC4\xBB" }, { 0x9A, "\xC4\xBC" }, { 0x9B, "\xC4\xBD" }, { 0x9C, "\xC4\xBE" }, - { 0x9D, "\xC4\xBF" }, { 0x9E, "\xC5\x80" }, { 0x9F, "\xC5\x81" }, { 0xA0, "\xC5\x82" }, { 0xAD, "\xC5\x83" } - }; - return hex_map; - } - - std::unordered_map unicode_to_bytes() { - static std::unordered_map hex_map = { - { "\x21", 0x21 }, { "\x22", 0x22 }, { "\x23", 0x23 }, { "\x24", 0x24 }, { "\x25", 0x25 }, { "\x26", 0x26 }, { "\x27", 0x27 }, { "\x28", 0x28 }, { "\x29", 0x29 }, { "\x2A", 0x2A }, - { "\x2B", 0x2B }, { "\x2C", 0x2C }, { "\x2D", 0x2D }, { "\x2E", 0x2E }, { "\x2F", 0x2F }, { "\x30", 0x30 }, { "\x31", 0x31 }, { "\x32", 0x32 }, { "\x33", 0x33 }, { "\x34", 0x34 }, - { "\x35", 0x35 }, { "\x36", 0x36 }, { "\x37", 0x37 }, { "\x38", 0x38 }, { "\x39", 0x39 }, { "\x3A", 0x3A }, { "\x3B", 0x3B }, { "\x3C", 0x3C }, { "\x3D", 0x3D }, { "\x3E", 0x3E }, - { "\x3F", 0x3F }, { "\x40", 0x40 }, { "\x41", 0x41 }, { "\x42", 0x42 }, { "\x43", 0x43 }, { "\x44", 0x44 }, { "\x45", 0x45 }, { "\x46", 0x46 }, { "\x47", 0x47 }, { "\x48", 0x48 }, - { "\x49", 0x49 }, { "\x4A", 0x4A }, { "\x4B", 0x4B }, { "\x4C", 0x4C }, { "\x4D", 0x4D }, { "\x4E", 0x4E }, { "\x4F", 0x4F }, { "\x50", 0x50 }, { "\x51", 0x51 }, { "\x52", 0x52 }, - { "\x53", 0x53 }, { "\x54", 0x54 }, { "\x55", 0x55 }, { "\x56", 0x56 }, { "\x57", 0x57 }, { "\x58", 0x58 }, { "\x59", 0x59 }, { "\x5A", 0x5A }, { "\x5B", 0x5B }, { "\x5C", 0x5C }, - { "\x5D", 0x5D }, { "\x5E", 0x5E }, { "\x5F", 0x5F }, { "\x60", 0x60 }, { "\x61", 0x61 }, { "\x62", 0x62 }, { "\x63", 0x63 }, { "\x64", 0x64 }, { "\x65", 0x65 }, { "\x66", 0x66 }, - { "\x67", 0x67 }, { "\x68", 0x68 }, { "\x69", 0x69 }, { "\x6A", 0x6A }, { "\x6B", 0x6B }, { "\x6C", 0x6C }, { "\x6D", 0x6D }, { "\x6E", 0x6E }, { "\x6F", 0x6F }, { "\x70", 0x70 }, - { "\x71", 0x71 }, { "\x72", 0x72 }, { "\x73", 0x73 }, { "\x74", 0x74 }, { "\x75", 0x75 }, { "\x76", 0x76 }, { "\x77", 0x77 }, { "\x78", 0x78 }, { "\x79", 0x79 }, { "\x7A", 0x7A }, - { "\x7B", 0x7B }, { "\x7C", 0x7C }, { "\x7D", 0x7D }, { "\x7E", 0x7E }, { "\xC2\xA1", 0xA1 }, { "\xC2\xA2", 0xA2 }, { "\xC2\xA3", 0xA3 }, { "\xC2\xA4", 0xA4 }, { "\xC2\xA5", 0xA5 }, - { "\xC2\xA6", 0xA6 }, { "\xC2\xA7", 0xA7 }, { "\xC2\xA8", 0xA8 }, { "\xC2\xA9", 0xA9 }, { "\xC2\xAA", 0xAA }, { "\xC2\xAB", 0xAB }, { "\xC2\xAC", 0xAC }, { "\xC2\xAE", 0xAE }, - { "\xC2\xAF", 0xAF }, { "\xC2\xB0", 0xB0 }, { "\xC2\xB1", 0xB1 }, { "\xC2\xB2", 0xB2 }, { "\xC2\xB3", 0xB3 }, { "\xC2\xB4", 0xB4 }, { "\xC2\xB5", 0xB5 }, { "\xC2\xB6", 0xB6 }, - { "\xC2\xB7", 0xB7 }, { "\xC2\xB8", 0xB8 }, { "\xC2\xB9", 0xB9 }, { "\xC2\xBA", 0xBA 
}, { "\xC2\xBB", 0xBB }, { "\xC2\xBC", 0xBC }, { "\xC2\xBD", 0xBD }, { "\xC2\xBE", 0xBE }, - { "\xC2\xBF", 0xBF }, { "\xC3\x80", 0xC0 }, { "\xC3\x81", 0xC1 }, { "\xC3\x82", 0xC2 }, { "\xC3\x83", 0xC3 }, { "\xC3\x84", 0xC4 }, { "\xC3\x85", 0xC5 }, { "\xC3\x86", 0xC6 }, - { "\xC3\x87", 0xC7 }, { "\xC3\x88", 0xC8 }, { "\xC3\x89", 0xC9 }, { "\xC3\x8A", 0xCA }, { "\xC3\x8B", 0xCB }, { "\xC3\x8C", 0xCC }, { "\xC3\x8D", 0xCD }, { "\xC3\x8E", 0xCE }, - { "\xC3\x8F", 0xCF }, { "\xC3\x90", 0xD0 }, { "\xC3\x91", 0xD1 }, { "\xC3\x92", 0xD2 }, { "\xC3\x93", 0xD3 }, { "\xC3\x94", 0xD4 }, { "\xC3\x95", 0xD5 }, { "\xC3\x96", 0xD6 }, - { "\xC3\x97", 0xD7 }, { "\xC3\x98", 0xD8 }, { "\xC3\x99", 0xD9 }, { "\xC3\x9A", 0xDA }, { "\xC3\x9B", 0xDB }, { "\xC3\x9C", 0xDC }, { "\xC3\x9D", 0xDD }, { "\xC3\x9E", 0xDE }, - { "\xC3\x9F", 0xDF }, { "\xC3\xA0", 0xE0 }, { "\xC3\xA1", 0xE1 }, { "\xC3\xA2", 0xE2 }, { "\xC3\xA3", 0xE3 }, { "\xC3\xA4", 0xE4 }, { "\xC3\xA5", 0xE5 }, { "\xC3\xA6", 0xE6 }, - { "\xC3\xA7", 0xE7 }, { "\xC3\xA8", 0xE8 }, { "\xC3\xA9", 0xE9 }, { "\xC3\xAA", 0xEA }, { "\xC3\xAB", 0xEB }, { "\xC3\xAC", 0xEC }, { "\xC3\xAD", 0xED }, { "\xC3\xAE", 0xEE }, - { "\xC3\xAF", 0xEF }, { "\xC3\xB0", 0xF0 }, { "\xC3\xB1", 0xF1 }, { "\xC3\xB2", 0xF2 }, { "\xC3\xB3", 0xF3 }, { "\xC3\xB4", 0xF4 }, { "\xC3\xB5", 0xF5 }, { "\xC3\xB6", 0xF6 }, - { "\xC3\xB7", 0xF7 }, { "\xC3\xB8", 0xF8 }, { "\xC3\xB9", 0xF9 }, { "\xC3\xBA", 0xFA }, { "\xC3\xBB", 0xFB }, { "\xC3\xBC", 0xFC }, { "\xC3\xBD", 0xFD }, { "\xC3\xBE", 0xFE }, - { "\xC3\xBF", 0xFF }, { "\xC4\x80", 0x00 }, { "\xC4\x81", 0x01 }, { "\xC4\x82", 0x02 }, { "\xC4\x83", 0x03 }, { "\xC4\x84", 0x04 }, { "\xC4\x85", 0x05 }, { "\xC4\x86", 0x06 }, - { "\xC4\x87", 0x07 }, { "\xC4\x88", 0x08 }, { "\xC4\x89", 0x09 }, { "\xC4\x8A", 0x0A }, { "\xC4\x8B", 0x0B }, { "\xC4\x8C", 0x0C }, { "\xC4\x8D", 0x0D }, { "\xC4\x8E", 0x0E }, - { "\xC4\x8F", 0x0F }, { "\xC4\x90", 0x10 }, { "\xC4\x91", 0x11 }, { "\xC4\x92", 0x12 }, { "\xC4\x93", 0x13 }, { "\xC4\x94", 0x14 }, { "\xC4\x95", 0x15 }, { "\xC4\x96", 0x16 }, - { "\xC4\x97", 0x17 }, { "\xC4\x98", 0x18 }, { "\xC4\x99", 0x19 }, { "\xC4\x9A", 0x1A }, { "\xC4\x9B", 0x1B }, { "\xC4\x9C", 0x1C }, { "\xC4\x9D", 0x1D }, { "\xC4\x9E", 0x1E }, - { "\xC4\x9F", 0x1F }, { "\xC4\xA0", 0x20 }, { "\xC4\xA1", 0x7F }, { "\xC4\xA2", 0x80 }, { "\xC4\xA3", 0x81 }, { "\xC4\xA4", 0x82 }, { "\xC4\xA5", 0x83 }, { "\xC4\xA6", 0x84 }, - { "\xC4\xA7", 0x85 }, { "\xC4\xA8", 0x86 }, { "\xC4\xA9", 0x87 }, { "\xC4\xAA", 0x88 }, { "\xC4\xAB", 0x89 }, { "\xC4\xAC", 0x8A }, { "\xC4\xAD", 0x8B }, { "\xC4\xAE", 0x8C }, - { "\xC4\xAF", 0x8D }, { "\xC4\xB0", 0x8E }, { "\xC4\xB1", 0x8F }, { "\xC4\xB2", 0x90 }, { "\xC4\xB3", 0x91 }, { "\xC4\xB4", 0x92 }, { "\xC4\xB5", 0x93 }, { "\xC4\xB6", 0x94 }, - { "\xC4\xB7", 0x95 }, { "\xC4\xB8", 0x96 }, { "\xC4\xB9", 0x97 }, { "\xC4\xBA", 0x98 }, { "\xC4\xBB", 0x99 }, { "\xC4\xBC", 0x9A }, { "\xC4\xBD", 0x9B }, { "\xC4\xBE", 0x9C }, - { "\xC4\xBF", 0x9D }, { "\xC5\x80", 0x9E }, { "\xC5\x81", 0x9F }, { "\xC5\x82", 0xA0 }, { "\xC5\x83", 0xAD } - }; - return hex_map; - } - - // len must be available - bool inline str_is_equal(const char* str1, const char* str2, size_t len) { - for (size_t i = 0; i < len; ++i) { - if (str1[i] != str2[i]) { - return false; - } - } - return true; - } - - std::vector bpe_gpt2_preprocess(const std::string& text) { - static std::unordered_map< unsigned char, std::string> byte_encoder = bytes_to_unicode(); - std::vector bpe_words; - std::vector bpe_encoded_words; - - std::string token=""; - const char *raw_text_p 
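The two lookup tables above are the expanded form of GPT-2's byte-level encoding: printable ASCII and most Latin-1 bytes map to themselves, while the remaining bytes are remapped to code points from U+0100 upward so that every byte gets a visible, non-whitespace character (space becomes "Ġ", newline becomes "Ċ"). A short sketch that derives the same forward mapping programmatically, usable to cross-check the table (illustrative, not part of the deleted header):

#include <cstdio>
#include <map>
#include <string>

// encode a code point below 0x800 as UTF-8 (sufficient for this mapping)
static std::string cp_to_utf8(int cp) {
    std::string s;
    if (cp < 0x80) { s += (char) cp; }
    else { s += (char) (0xC0 | (cp >> 6)); s += (char) (0x80 | (cp & 0x3F)); }
    return s;
}

int main() {
    std::map<int, std::string> byte_to_unicode;
    int n = 0;
    for (int b = 0; b < 256; ++b) {
        const bool keep = (b >= 0x21 && b <= 0x7E) || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF);
        byte_to_unicode[b] = cp_to_utf8(keep ? b : 256 + n++);
    }
    // e.g. space (0x20) becomes U+0120 "Ġ", newline (0x0A) becomes U+010A "Ċ"
    printf("0x20 -> %s, 0x0A -> %s\n", byte_to_unicode[0x20].c_str(), byte_to_unicode[0x0A].c_str());
}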
= text.c_str(); - // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting=false; - - std::vector text_utf; - text_utf.reserve(text.size()); - bpe_words.reserve(text.size()); - bpe_encoded_words.reserve(text.size()); - - text_utf = CNCTUnicode::split_utf8_enhanced(text); - - for (int i = 0; i < (int)text_utf.size(); i++) { - const CNCTString &utf_char = text_utf[i]; - bool split_condition = false; - const char *text_pos = raw_text_p + utf_char.seq_offset_bytes; - int bytes_remain = strlen(text_pos); - // forward backward lookups - const CNCTString &utf_char_next = (i+1 < (int)text_utf.size()) ? text_utf[i+1] : CNCTString(); - const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString(); - // const CNCTString &utf_char_prev = (i > 0) ? text_utf[i-1] : CNCTString(); - - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == '\'' && (utf_char_next == 's' || utf_char_next == 't' || utf_char_next == 'm' || utf_char_next == 'd')) { - split_condition = true; - } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str; - bpe_words.emplace_back(token); - token=""; - i++; - continue; - } - } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == '\'' && ( - (utf_char_next == 'r' || utf_char_next_next == 'e') || - (utf_char_next == 'v' || utf_char_next_next == 'e') || - (utf_char_next == 'l' || utf_char_next_next == 'l')) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char.str + utf_char_next.str + utf_char_next_next.str; - bpe_words.emplace_back(token); // the contraction - token=""; - i+=2; - continue; - } - } - - if (!split_condition && !collecting) { - if (utf_char.char_type == CNCTCharType::LETTER || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::LETTER)) { - collecting_letter = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::DIGIT || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::DIGIT)) { - collecting_numeric = true; - collecting = true; - } else if ( - ((utf_char.char_type != CNCTCharType::LETTER && utf_char.char_type != CNCTCharType::DIGIT) && (utf_char.char_type != CNCTCharType::WHITESPACE)) || - (!token.size() && utf_char==" " && utf_char_next.char_type != CNCTCharType::LETTER && utf_char_next.char_type != CNCTCharType::DIGIT && utf_char_next.char_type != CNCTCharType::WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::WHITESPACE && utf_char_next.char_type == CNCTCharType::WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } else if (utf_char.char_type == CNCTCharType::WHITESPACE) { - split_condition = true; - } - } else if (!split_condition && collecting) { - if (collecting_letter && utf_char.char_type != CNCTCharType::LETTER) { - split_condition = true; - } else if (collecting_numeric && utf_char.char_type != CNCTCharType::DIGIT) { - split_condition = true; - } else if (collecting_special && (utf_char.char_type == 
CNCTCharType::LETTER || utf_char.char_type == CNCTCharType::DIGIT || utf_char.char_type == CNCTCharType::WHITESPACE)) { - split_condition = true; - } else if (collecting_whitespace_lookahead && utf_char_next.char_type != CNCTCharType::WHITESPACE) { - split_condition = true; - } - } - - if(utf_char_next.str.size() == 0) { - split_condition = true; // final - token += utf_char.str; - } - - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); - } - token = utf_char.str; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; - } else { - token += utf_char.str; - } - } - - for (std::string& word : bpe_words) { - std::string encoded_token=""; - for (char& c : word) { - encoded_token += byte_encoder[c]; - } - bpe_encoded_words.emplace_back(encoded_token); - } - - return bpe_encoded_words; - } - - // decoder (for one token) - std::string decode_token(const std::string& token) { - static std::unordered_map< std::string, unsigned char> byte_decoder = unicode_to_bytes(); - std::string decoded_token=""; - auto unicode_seqeunces = CNCTUnicode::split_utf8(token); - for (auto& unicode_sequence : unicode_seqeunces) { - decoded_token += byte_decoder[unicode_sequence]; - } - - return decoded_token; - } - - const gpt2bpe_vocab & vocab_; - std::vector symbols_; - std::vector symbols_final; - ggllm_bpe_bigram::queue work_queue_; - bool flag_g2ws=false; -}; - -static std::vector gpt2bpe_tokenize(const gpt2bpe_vocab & vocab, const std::string & text, bool bos, bool g2ws ) { - gpt2bpe_tokenizer tokenizer(vocab, g2ws); - std::vector output; - - if (text.empty()) { - return output; - } - - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } - - tokenizer.tokenize(text, output); - return output; -} - -#endif // CMPNCT_GPT2BPE diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp deleted file mode 100644 index e9197f6b5..000000000 --- a/examples/gptneox-wip/falcon-main.cpp +++ /dev/null @@ -1,1111 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct falcon_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_head_kv = 1; // Needs to be 1 for 7B model - uint32_t n_ff = 0; - uint32_t n_block = 0; - float norm_eps = 1e-5; -}; -struct falcon_block { - // normalization - struct ggml_tensor* input_layernorm; - struct ggml_tensor* input_layernorm_b; - struct ggml_tensor* attention_norm; // Falcon-40B only - struct ggml_tensor* attention_norm_b; // Falcon-40B only - - // attention - struct ggml_tensor* query_key_value; - struct ggml_tensor* wo; - - // ff - struct ggml_tensor* ffn_up; - struct ggml_tensor* ffn_down; -}; - -struct falcon_model { - falcon_hparams hparams; - - struct ggml_tensor* tok_embeddings; - struct ggml_tensor* output_norm; - struct ggml_tensor* output_norm_b; - struct ggml_tensor* lm_head; - - std::vector blocks; - - // key + value memory - struct ggml_tensor* memory_k; - struct ggml_tensor* memory_v; - - struct gguf_context * ggufctx; - struct ggml_context * ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = 
std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); 
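One property of the get_next_arg() helper above worth noting: because it rejects any value that starts with '-', flags cannot take negative numbers (for example "-s -1" is reported as a missing argument; the negative default seed is only reachable by omitting the flag). A reduced standalone sketch of the same pattern, with illustrative names:

#include <cstdio>
#include <cstdlib>
#include <string>

// consume the value that must follow a flag such as "-t 8"; bail out otherwise
static std::string next_value(int & i, int argc, char ** argv, const std::string & flag) {
    if (i + 1 < argc && argv[i + 1][0] != '-') {
        return argv[++i];
    }
    fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
    exit(1);
}

int main(int argc, char ** argv) {
    int n_threads = 4;
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-t" || arg == "--threads") {
            n_threads = std::stoi(next_value(i, argc, argv, arg));
        }
    }
    printf("n_threads = %d\n", n_threads);
}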
- } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = 
std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - -} - -struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", 
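For reference, stripped of the repetition penalty, the sampler above reduces to: scale the logits by 1/temperature, keep the top_k candidates, soft-max them, apply the nucleus (top_p) cut, then draw from what remains. A compact standalone sketch (illustrative only; the deleted function additionally rescales tokens seen in the last repeat_last_n positions):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

static int sample_top_k_top_p(const std::vector<float> & logits, int top_k, double top_p, double temp, std::mt19937 & rng) {
    const int n = (int) logits.size();
    top_k = std::min(top_k, n);

    // temperature scaling, keeping the original token ids
    std::vector<std::pair<double, int>> scaled(n);
    for (int i = 0; i < n; ++i) scaled[i] = { logits[i] / temp, i };

    // keep the top_k highest-scoring candidates
    std::partial_sort(scaled.begin(), scaled.begin() + top_k, scaled.end(),
                      [](const auto & a, const auto & b) { return a.first > b.first; });
    scaled.resize(top_k);

    // soft-max over the kept candidates (subtract the max for numerical stability)
    double maxl = scaled[0].first, sum = 0.0;
    std::vector<double> probs(top_k);
    for (int i = 0; i < top_k; ++i) { probs[i] = std::exp(scaled[i].first - maxl); sum += probs[i]; }
    for (auto & p : probs) p /= sum;

    // nucleus cut: keep the smallest prefix whose cumulative probability reaches top_p
    if (top_p < 1.0) {
        double cum = 0.0;
        for (int i = 0; i < top_k; ++i) {
            cum += probs[i];
            if (cum >= top_p) { probs.resize(i + 1); scaled.resize(i + 1); break; }
        }
    }

    // std::discrete_distribution renormalizes the remaining weights itself
    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return scaled[dist(rng)].second;
}

int main() {
    std::mt19937 rng(42);
    std::vector<float> logits = { 1.0f, 3.0f, 0.5f, 2.0f };
    printf("sampled id: %d\n", sample_top_k_top_p(logits, 3, 0.9, 0.8, rng));
}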
__func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - // check model tensor data layout kv - keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) { - printf("%s: model tensor data layout not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model tensor data layout not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.feed_forward_length"); - if (keyidx != -1) { hparams.n_ff = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count_kv"); - if (keyidx != -1) { hparams.n_head_kv = gguf_get_val_u32(ggufctx, keyidx); } - - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_head_kv = %d\n", __func__, hparams.n_head_kv); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - 
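The hparams block above repeats the same find-key, read-or-fail pattern for every required field. A small helper expressing that pattern, shown only to illustrate what the repeated blocks do (not part of the deleted sources; it relies on the gguf_* accessors already pulled in via ggml.h):

#include "ggml.h"
#include <cstdint>
#include <cstdio>

// read a required uint32 key, or report it and flag failure
static uint32_t gguf_require_u32(struct gguf_context * ctx, const char * key, bool & ok) {
    const int keyidx = gguf_find_key(ctx, key);
    if (keyidx == -1) {
        fprintf(stderr, "missing required gguf key: %s\n", key);
        ok = false;
        return 0;
    }
    return gguf_get_val_u32(ctx, keyidx);
}

// usage, mirroring the block above:
//   bool ok = true;
//   hparams.n_ctx  = gguf_require_u32(ggufctx, "falcon.context_length",   ok);
//   hparams.n_embd = gguf_require_u32(ggufctx, "falcon.embedding_length", ok);
//   if (!ok) return false;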
hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - - auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - - model.blocks.resize(n_block); - - model.tok_embeddings = ggml_get_tensor(ctx, "token_embd.weight"); - - 
model.output_norm = ggml_get_tensor(ctx, "output_norm.weight"); - model.output_norm_b = ggml_get_tensor(ctx, "output_norm.bias"); - model.lm_head = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.tok_embeddings; - model.tensors["output_norm.weight"] = model.output_norm; - model.tensors["output_norm.bias"] = model.output_norm_b; - model.tensors["output.weight"] = model.lm_head; - - for (int i = 0; i < n_block; ++i) { - - auto& block = model.blocks[i]; - std::string blocknamestart = "blk." + std::to_string(i) + "."; - - block.input_layernorm = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.input_layernorm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - block.attention_norm = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.weight" ); - block.attention_norm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.bias" ); - } - - // query_key_value shape for config.multi_query == True: - block.query_key_value = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.wo = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - - block.ffn_up = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.ffn_down = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - - // map by name - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - // Falcon-40B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - model.tensors[blocknamestart + "attn_norm_2.weight"] = block.attention_norm; - model.tensors[blocknamestart + "attn_norm_2.bias"] = block.attention_norm_b; - } else { - // Falcon-7B: - model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; - model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; - } - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.query_key_value; - model.tensors[blocknamestart + "attn_output.weight"] = block.wo; - - model.tensors[blocknamestart + "ffn_up.weight"] = block.ffn_up; - model.tensors[blocknamestart + "ffn_down.weight"] = block.ffn_down; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool falcon_eval( - const falcon_model & model, - const int 
n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - - - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_head_kv = hparams.n_head_kv; - const int n_vocab = hparams.n_vocab; - const size_t head_dim = n_embd / n_head; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; -// gf.n_threads = n_threads; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); -// struct ggml_tensor* repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head); - - ggml_type wtype = GGML_TYPE_F32; - const int sizeof_wtype = ggml_type_sizef(wtype); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - struct ggml_tensor * layernorm_output; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - layernorm_output = ggml_norm(ctx0, inpL); - - layernorm_output = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].input_layernorm, layernorm_output), - layernorm_output), - ggml_repeat(ctx0, model.blocks[il].input_layernorm_b, layernorm_output)); - - if ( hparams.n_head_kv == 8 ) { // Falcon-40B - cur = ggml_norm(ctx0, inpL); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.blocks[il].attention_norm, cur), - cur), - ggml_repeat(ctx0, model.blocks[il].attention_norm_b, cur)); - } - else { // Falcon 7B - cur = layernorm_output; - } - - // compute QKV - - cur = ggml_mul_mat(ctx0, model.blocks[il].query_key_value, cur); - - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. 
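To make the view arithmetic below concrete: each row of the fused QKV projection holds n_head query heads followed by n_head_kv key heads and n_head_kv value heads, all of width head_dim, and the three ggml_view_3d calls simply carve out those slices by offset. A small sketch of the offsets, using Falcon-7B-like numbers (n_embd = 4544, n_head = 71, n_head_kv = 1; treat these as assumed, illustrative values):

#include <cstdint>
#include <cstdio>

int main() {
    // assumed, Falcon-7B-like shapes (illustrative)
    const int64_t n_embd    = 4544;
    const int64_t n_head    = 71;
    const int64_t n_head_kv = 1;
    const int64_t head_dim  = n_embd / n_head;                     // 64

    const int64_t qkv_row   = head_dim * (n_head + 2*n_head_kv);   // fused per-token row: Q | K | V
    const int64_t q_offset  = 0;                                   // first n_head*head_dim elements
    const int64_t k_offset  = head_dim * n_head;                   // next n_head_kv*head_dim elements
    const int64_t v_offset  = head_dim * (n_head + n_head_kv);     // last n_head_kv*head_dim elements

    printf("head_dim=%lld qkv_row=%lld q@%lld k@%lld v@%lld (elements per token)\n",
           (long long) head_dim, (long long) qkv_row,
           (long long) q_offset, (long long) k_offset, (long long) v_offset);
    // the ggml views multiply these element offsets by the element size to get byte offsets
}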
- - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, head_dim, n_head, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - 0); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * n_head * sizeof_wtype); - - struct ggml_tensor * Vcur = ggml_view_3d( - ctx0, cur, head_dim, n_head_kv, N, - head_dim * sizeof_wtype, - head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, - head_dim * (n_head + n_head_kv) * sizeof_wtype); - - // using mode = 2 for neox mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0); - - // store key and value to memory - { - struct ggml_tensor* k = ggml_view_1d( - ctx0, model.memory_k, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_k) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - struct ggml_tensor* v = ggml_view_1d( - ctx0, model.memory_v, N * n_head_kv * head_dim, - (ggml_element_size(model.memory_v) * n_head_kv * head_dim) * - (il * n_ctx + n_past)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * K = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_k) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - - // K * Q - -// K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy)); - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim))) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor* V = ggml_permute( - ctx0, - ggml_reshape_3d( - ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim, - il * n_ctx * - ggml_element_size(model.memory_v) * - n_head_kv * - head_dim), - head_dim, n_head_kv, n_past + N), - 0, 2, 1, 3); - -// V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy))); - V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, - model.blocks[il].wo, - cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - struct ggml_tensor* inpFF = layernorm_output; - struct ggml_tensor* attn_out = ggml_cpy( - ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - { - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_up, inpFF); - cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_down, cur); - } - - cur = ggml_add(ctx0, cur, attn_out); - cur = ggml_add(ctx0, 
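The graph built above (KQ, scaling by 1/sqrt(head_dim), causal masking, soft-max, then the V product) is ordinary scaled dot-product attention evaluated against the K/V cache. As a reference for what one head computes for one new token position, a naive standalone sketch (illustrative only, not how ggml evaluates it):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// one query attending over the cached key/value vectors of width head_dim
static std::vector<float> attend(const std::vector<float> & q,
                                 const std::vector<std::vector<float>> & K,
                                 const std::vector<std::vector<float>> & V) {
    const size_t d = q.size();
    std::vector<float> scores(K.size());
    float maxs = -1e30f;
    for (size_t t = 0; t < K.size(); ++t) {                // causal: only past/current positions are cached
        float s = 0.0f;
        for (size_t i = 0; i < d; ++i) s += q[i] * K[t][i];
        scores[t] = s / std::sqrt((float) d);              // KQ_scaled
        maxs = std::max(maxs, scores[t]);
    }
    float sum = 0.0f;
    for (auto & s : scores) { s = std::exp(s - maxs); sum += s; }   // soft-max
    for (auto & s : scores) s /= sum;

    std::vector<float> out(d, 0.0f);                       // weighted sum of values (KQV)
    for (size_t t = 0; t < K.size(); ++t)
        for (size_t i = 0; i < d; ++i) out[i] += scores[t] * V[t][i];
    return out;
}

int main() {
    std::vector<float> q = {1, 0};
    std::vector<std::vector<float>> K = {{1, 0}, {0, 1}}, V = {{1, 1}, {2, 2}};
    auto o = attend(q, K, V);
    printf("%.3f %.3f\n", o[0], o[1]);   // weighted toward V[0], since q aligns with K[0]
}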
cur, inpL); - // input for next layer - inpL = cur; - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.output_norm, inpL), - inpL), - ggml_repeat(ctx0, model.output_norm_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); -// ggml_graph_compute (ctx0, &gf); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - falcon_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - falcon_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 
0) { - const int64_t t_start_us = ggml_time_us(); - - if (!falcon_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp deleted file mode 100644 index b76bafaa8..000000000 --- a/examples/gptneox-wip/gptneox-main.cpp +++ /dev/null @@ -1,1083 +0,0 @@ -#include "ggml.h" -#include "cmpnct_gpt2bpe.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// default hparams -struct gpt_neox_hparams { - size_t n_merges = 0; - size_t n_vocab = 0; - uint32_t n_ctx = 0; - uint32_t n_embd = 0; - uint32_t n_head = 0; - uint32_t n_block = 0; - uint32_t n_rot = 0; // rotary_pct * (n_embd / n_head) - bool par_res = true; - float norm_eps = 1e-5; -}; - -struct gpt_neox_block { - // pre normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // post normalization - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // ff - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_neox_model { - gpt_neox_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct 
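For reference, the prompt-then-generate loop in the falcon main() above boils down to the usual incremental-decoding pattern: feed the prompt in batches while advancing n_past, then repeatedly evaluate the single most recent token and sample the next one. A stripped-down sketch of that bookkeeping with stubbed-out eval and sampling hooks (illustrative only):

#include <algorithm>
#include <cstdio>
#include <vector>

// stand-ins for falcon_eval() and the sampler, just to make the control flow visible
static void eval_stub(const std::vector<int> & tokens, int n_past) {
    printf("eval %zu token(s) at n_past=%d\n", tokens.size(), n_past);
}
static int sample_stub() { static int next = 100; return next++; }

int main() {
    const std::vector<int> prompt = {11, 22, 33, 44, 55};
    const size_t n_batch = 2;
    const int    n_predict = 3;

    int n_past = 0;

    // 1) prompt processing, n_batch tokens at a time
    for (size_t i = 0; i < prompt.size(); i += n_batch) {
        std::vector<int> chunk(prompt.begin() + i,
                               prompt.begin() + std::min(prompt.size(), i + n_batch));
        eval_stub(chunk, n_past);
        n_past += (int) chunk.size();
    }

    // 2) generation: sample from the last logits, evaluate the new token, repeat
    int tok = sample_stub();
    for (int i = 0; i < n_predict; ++i) {
        printf("emit %d\n", tok);
        eval_stub({tok}, n_past);
        n_past += 1;
        tok = sample_stub();
    }
}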
ggml_tensor * wte; // position embedding - - struct ggml_tensor * lmh_g; // language model head - - std::vector blocks; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct gguf_context * ggufctx; - struct ggml_context * ctx; - struct ggml_context * kvctx; - - std::map tensors; -}; - -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - uint32_t n_predict = 200; // new tokens to predict - uint32_t n_batch = 512; // batch size for prompt processing - - // sampling parameters - int32_t top_k = 40; - float top_p = 1.0f; - float temp = 0.8f; - int32_t repeat_last_n = 64; - float repeat_penalty = 1.02f; - - std::string model = ""; // model path - std::string prompt = ""; - - std::string token_test = ""; - bool interactive = false; - int32_t interactive_port = -1; - int32_t n_gpu_layers = 0; -}; - -void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " load prompt from a file\n"); - fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); - fprintf(stderr, " test tokenization\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); -} - -// Function to check if the next argument exists -std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { - if (i + 1 < argc && argv[i + 1][0] != '-') { - return argv[++i]; - } else { - fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") 
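A note on the n_rot field declared in the hparams above: GPT-NeoX applies rotary position embeddings to only a fraction (rotary_pct) of each head's dimensions, so n_rot = rotary_pct * (n_embd / n_head) as the comment says, and the remaining dimensions pass through unrotated. A tiny worked example with assumed, illustrative numbers:

#include <cstdio>

int main() {
    // assumed example values (not read from a real model)
    const int   n_embd     = 2048;
    const int   n_head     = 16;
    const float rotary_pct = 0.25f;

    const int head_dim = n_embd / n_head;                  // 128
    const int n_rot    = (int) (rotary_pct * head_dim);    // 32: only these leading dims are rotated

    printf("head_dim=%d n_rot=%d (remaining %d dims are left unrotated)\n",
           head_dim, n_rot, head_dim - n_rot);
}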
{ - params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_k") { - params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--top_p") { - params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--temp") { - params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-last-n") { - params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "--repeat-penalty") { - params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-m" || arg == "--model") { - params.model = get_next_arg(i, argc, argv, arg, params); - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "-ip" || arg == "--interactive-port") { - params.interactive = true; - params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); - } else if (arg == "-h" || arg == "--help") { - gpt_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-f" || arg == "--file") { - get_next_arg(i, argc, argv, arg, params); - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-tt" || arg == "--token_test") { - params.token_test = get_next_arg(i, argc, argv, arg, params); - } - else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - gpt_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -gpt2bpe_vocab::id sample_top_k_top_p_repeat( - const gpt2bpe_vocab & vocab, - const float * logits, - const int32_t * last_n_tokens_data, - size_t last_n_tokens_data_size, - int top_k, - double top_p, - double temp, - int repeat_last_n, - float repeat_penalty, - std::mt19937 & rng) { - - int n_logits = vocab.id_to_token.size(); - - const auto * plogits = logits; - - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - gpt2bpe_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } - } - return max_id; - } - - - std::vector> logits_id; - logits_id.reserve(n_logits); - - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - 
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } - } - } - - // find the top K tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - logits_id.resize(top_k); - - double maxl = -INFINITY; - for (const auto & kv : logits_id) { - maxl = std::max(maxl, kv.first); - } - - // compute probs for the top K tokens - std::vector probs; - probs.reserve(logits_id.size()); - - double sum = 0.0; - for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); - probs.push_back(p); - sum += p; - } - - // normalize the probs - for (auto & p : probs) { - p /= sum; - } - - if (top_p < 1.0f) { - double cumsum = 0.0f; - for (int i = 0; i < top_k; i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - top_k = i + 1; - probs.resize(top_k); - logits_id.resize(top_k); - break; - } - } - - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { - probs[i] *= cumsum; - } - } - -// printf("\n"); -// for (int i = 0; i < (int) probs.size(); i++) { -// for (int i = 0; i < 10; i++) { -// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); -// } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - return logits_id[idx].second; - -} - -struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ - - struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); - if( cur == NULL ) { - printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); - } else { -// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); - } - - return cur; -} - -// load the model's weights from a file -bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2bpe_vocab & vocab) { - printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); - - model.ctx = NULL; - - struct gguf_init_params ggufparams = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - - auto & ggufctx = model.ggufctx; - - ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); - - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return false; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print all kv - #if 0 - { - const int n_kv = gguf_get_n_kv(ggufctx); - - printf("%s: n_kv: %d\n", __func__, n_kv); - - for (int i = 0; i < n_kv; ++i) { - const char * key = gguf_get_key(ggufctx, i); - - printf("%s: kv[%d]: key = %s\n", __func__, i, key); - } - } - #endif - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - 
keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - int keyidx; - - // check model architecture kv - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { - printf("%s: model architecture not supported!\n", __func__); - return false; - } - } else { - printf("%s: gguf model architecture not found!\n", __func__); - return false; - } - - } - - // load hparams - { - auto & hparams = model.hparams; - - bool ok = true; - int keyidx; - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.context_length"); - if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.embedding_length"); - if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.head_count"); - if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.block_count"); - if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.rope.dimension_count"); - if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.use_parallel_residual"); - if (keyidx != -1) { hparams.par_res = gguf_get_val_bool(ggufctx, keyidx); } else { ok = false; } } - - if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.layer_norm_epsilon"); - if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return false; - } - - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_block = %d\n", __func__, hparams.n_block); - printf("%s: n_rot = %d\n", __func__, hparams.n_rot); - printf("%s: par_res = %d\n", __func__, hparams.par_res); - printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); - - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - - if (keyidx != -1) { - if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - printf("%s: tokenizer model not supported!\n", __func__); - return false; - } - } else { - printf("%s: tokenizer model not found!\n", __func__); - return false; - } - - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - - if (tokens_keyidx == -1) { - printf("%s: gpt2 tokenizer vocab not found!\n", __func__); - return false; - } - - int merges_keyidx = 
gguf_find_key(ggufctx, "tokenizer.ggml.merges"); - - if (merges_keyidx == -1) { - printf("%s: gpt2 tokenizer merges not found!\n", __func__); - return false; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); - hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - - printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); - - for (size_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - -// printf("token %d = '%s'\n",i,word.c_str() ); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - - if( vocab.id_to_token[i] == "\n" ) { - vocab.linefeed_id = i; - } - } - - std::vector> bpe_merges; - - for (size_t i = 0; i < hparams.n_merges; i++) { - - std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); - - // Split the merges - std::string first, second; - size_t pos = word.find(' ', 1); // Start the search from the second character - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - bpe_merges.push_back(std::make_pair(first, second)); - } - - vocab.populate_bpe_ranks(bpe_merges); - - - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - - if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } - } - - - auto & ctx = model.ctx; - size_t ctx_size = ggml_get_mem_size(ctx); - - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - - // print tensor info - #if 0 - { - const int n_tensors = gguf_get_n_tensors(ggufctx); - - printf("%s: n_tensors: %d\n", __func__, n_tensors); - - for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name (ggufctx, i); - const size_t offset = gguf_get_tensor_offset(ggufctx, i); - - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); - } - } - #endif - - // prepare memory for the weights - { - const int n_block = 
model.hparams.n_block; - - model.blocks.resize(n_block); - - model.wte = ggml_get_tensor(ctx, "token_embd.weight"); - model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight"); - model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias"); - model.lmh_g = ggml_get_tensor(ctx, "output.weight"); - - // map by name - model.tensors["token_embd.weight"] = model.wte; - model.tensors["output_norm.weight"] = model.ln_f_g; - model.tensors["output_norm.bias"] = model.ln_f_b; - model.tensors["output.weight"] = model.lmh_g; - - for (int i = 0; i < n_block; ++i) { - auto & block = model.blocks[i]; - - std::string blocknamestart = "blk." + std::to_string(i) + "."; - - block.ln_1_g = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); - block.ln_1_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); - - block.c_attn_attn_w = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); - block.c_attn_attn_b = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" ); - - block.c_attn_proj_w = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); - block.c_attn_proj_b = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" ); - - block.ln_2_g = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" ); - block.ln_2_b = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias"); - - block.c_mlp_fc_w = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); - block.c_mlp_fc_b = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" ); - - block.c_mlp_proj_w = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); - block.c_mlp_proj_b = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" ); - - // map by name - model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g; - model.tensors[blocknamestart + "attn_norm.bias"] = block.ln_1_b; - - model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w; - model.tensors[blocknamestart + "attn_qkv.bias"] = block.c_attn_attn_b; - - model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w; - model.tensors[blocknamestart + "attn_output.bias"] = block.c_attn_proj_b; - - model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g; - model.tensors[blocknamestart + "ffn_norm.bias"] = block.ln_2_b; - - model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w; - model.tensors[blocknamestart + "ffn_up.bias"] = block.c_mlp_fc_b; - - model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w; - model.tensors[blocknamestart + "ffn_down.bias"] = block.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & kvctx = model.kvctx; - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - - const int64_t n_mem = n_block*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.kvctx = ggml_init(params); - if (!model.kvctx) { - fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); - return false; - } - - } - - - model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - return true; -} - - -// feed-forward network -ggml_tensor * 
gpt_neox_ff( - const gpt_neox_block &block, - ggml_context * ctx0, - ggml_tensor * inp, - const gpt_neox_hparams &hparams) { - - ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps); - - cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur)); - cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - - // projection - // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur); - - cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur); - return cur; -} - -// evaluate the transformer -// -// - model: the model -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt_neox_eval( - const gpt_neox_model & model, - const int n_threads, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w, - size_t & mem_per_token) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_block = hparams.n_block; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_rot; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - // use 2 scratch buffers - // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 256u*1024*1024; - static void * scr0 = malloc(scr0_size); - - static size_t scr1_size = 256u*1024*1024; - static void * scr1 = malloc(scr1_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - - // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); - - for (int il = 0; il < n_block; ++il) { - struct ggml_tensor * cur; - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // self-attention - { - { - cur = ggml_norm(ctx0, inpL, hparams.norm_eps); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur)); - } - - // compute QKV - { - - cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = 
ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); - - // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0); - - // store key and value to memory - { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); - - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); - - // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection - { - cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur); - } - } - - ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); - - if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - - cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } else { - struct ggml_tensor * inpFF = cur; - - // this is independent of the self-attention result, so it could be done in parallel to the self-attention - // note here we pass inpL instead of cur - cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams); - - // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - inpL = ggml_add(ctx0, cur, inpL); - } - } - - ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); - - // norm - { - inpL = ggml_norm(ctx0, inpL, hparams.norm_eps); - - // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - 
ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); - } - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - // lm_head - { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); - - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), - // inpL); - } - - // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - - // return result for just the last token - embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); - - ggml_free(ctx0); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (!gpt_params_parse(argc, argv, params)) { - return 1; - } - - int64_t t_load_us = 0; - - gpt2bpe_vocab vocab; - gpt_neox_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_model_load(params.model, model, vocab)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - if (params.top_k == 0) { - params.top_k = model.hparams.n_vocab; - } - - printf("%s: seed = %d\n", __func__, params.seed); - printf("%s: temp = %.3f\n", __func__, params.temp); - printf("%s: top_k = %d\n", __func__, params.top_k); - printf("%s: top_p = %.3f\n", __func__, params.top_p); - printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); - printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); - - std::mt19937 rng(params.seed); - - if (params.prompt.empty()) { - params.prompt = "Once upon"; - } - - std::vector last_n_tokens(model.hparams.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); -// for (size_t i = 0; i < embd_inp.size(); i++) { -// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); -// } - - if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { - params.n_predict = model.hparams.n_ctx-embd_inp.size(); - } - - printf("%s: n_predict = %d\n", __func__, params.n_predict); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to 
predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const int repeat_last_n = params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt2bpe_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str() ); - } - fflush(stdout); - - // end of text token - if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 046f9b1e7..57d01cb0b 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) endif() diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 128d67080..6331335e3 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -39,8 +39,8 @@ static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; -static bool is_interacting = false; +static bool is_interacting = false; static void write_logfile( const llama_context * ctx, const gpt_params & params, const llama_model * model, @@ -104,7 +104,7 @@ static void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { @@ -358,36 +358,10 @@ int main(int argc, char ** argv) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, 
mirostat_ent = %f\n", - sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); - struct llama_grammar * grammar = NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); - grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = sparams.logit_bias.find(llama_token_eos(ctx)); - if (it != sparams.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); - } - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - } - LOG_TEE("\n##### Infill mode #####\n\n"); if (params.infill) { printf("\n************\n"); @@ -430,7 +404,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while (n_remain != 0 || params.interactive) { // predict @@ -549,7 +523,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - llama_sampling_accept(ctx_sampling, ctx, id); + llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -567,8 +541,11 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - ctx_sampling->prev.erase(ctx_sampling->prev.begin()); - ctx_sampling->prev.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -600,7 +577,7 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){ + if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){ if(is_interacting && !params.interactive_first) { // print an eot token printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); @@ -617,7 +594,7 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); // check if we got an empty line, if so we use the old input - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_prefix = buffer; } buffer.clear(); @@ -627,7 +604,7 @@ int main(int argc, char ** argv) { buffer += line; } 
while (another_line); // check if we got an empty line - if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { params.input_suffix = buffer; } buffer.clear(); @@ -640,7 +617,7 @@ int main(int argc, char ** argv) { process_escapes(params.input_suffix); } suff_rm_leading_spc = params.escape; - if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -667,7 +644,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of text token in interactive mode - else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) { + else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { @@ -740,15 +717,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } @@ -778,9 +747,7 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h index e050b59be..45b2b1ad3 100644 --- a/examples/llava/llava-utils.h +++ b/examples/llava/llava-utils.h @@ -58,28 +58,30 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n // TODO: use common/sampling.h inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) { - // out of user input, sample next token - const float temp = params.sampling_params.temp; - const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k; - const float top_p = params.sampling_params.top_p; - const float tfs_z = params.sampling_params.tfs_z; - const float typical_p = params.sampling_params.typical_p; - // const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n; - // const float repeat_penalty = params.sampling_params.repeat_penalty; - // const float alpha_presence = params.sampling_params.presence_penalty; - // const float alpha_frequency = params.sampling_params.frequency_penalty; - const int mirostat = params.sampling_params.mirostat; - const float mirostat_tau = params.sampling_params.mirostat_tau; - const float mirostat_eta = params.sampling_params.mirostat_eta; - // const bool penalize_nl = params.sampling_params.penalize_nl; + auto & sparams = params.sparams; + + // out of user input, sample next token + const float temp = sparams.temp; + const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k; + const float top_p = sparams.top_p; + const float tfs_z = sparams.tfs_z; + const float typical_p = sparams.typical_p; + // const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? 
n_ctx : sparams.repeat_last_n; + // const float repeat_penalty = sparams.repeat_penalty; + // const float alpha_presence = sparams.presence_penalty; + // const float alpha_frequency = sparams.frequency_penalty; + const int mirostat = sparams.mirostat; + const float mirostat_tau = sparams.mirostat_tau; + const float mirostat_eta = sparams.mirostat_eta; + // const bool penalize_nl = sparams.penalize_nl; llama_token id = 0; { auto logits = llama_get_logits(ctx_llama); auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama)); - // Apply params.logit_bias map - for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) { + // Apply params.logit_bias map + for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { logits[it->first] += it->second; } @@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - // TODO: Apply penalties - // float nl_logit = logits[llama_token_nl(ctx)]; - // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - // llama_sample_repetition_penalty(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, repeat_penalty); - // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - // last_n_repeat, alpha_frequency, alpha_presence); - // if (!penalize_nl) { - // logits[llama_token_nl(ctx)] = nl_logit; - // } + // TODO: Apply penalties + // float nl_logit = logits[llama_token_nl(ctx)]; + // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + // llama_sample_repetition_penalty(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, repeat_penalty); + // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, alpha_frequency, alpha_presence); + // if (!penalize_nl) { + // logits[llama_token_nl(ctx)] = nl_logit; + // } if (temp <= 0) { // Greedy sampling diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1a5911c56..db5309afe 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -108,7 +108,7 @@ int main(int argc, char ** argv) { if (!gpt_params_parse(argc, argv, params)) { return 1; } - llama_sampling_params & sparams = params.sampling_params; + llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("main", "log")); @@ -415,8 +415,7 @@ int main(int argc, char ** argv) { } } } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); @@ -459,7 +458,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector 
embd_guidance; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -612,7 +611,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - llama_sampling_accept(ctx_sampling, ctx, id); + llama_sampling_accept(ctx_sampling, ctx, id, true); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); @@ -631,12 +630,9 @@ int main(int argc, char ** argv) { while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - // GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context - // Most likely will remove this in the future to avoid exposing "prev" - // Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition - // penalty will be applied only based on the tokens generated by the model. - ctx_sampling->prev.erase(ctx_sampling->prev.begin()); - ctx_sampling->prev.push_back(embd_inp[n_consumed]); + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -667,12 +663,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt + // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { - std::string last_output; - for (auto id : ctx_sampling->prev) { - last_output += llama_token_to_piece(ctx, id); - } + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. 
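
Note on the sampling hunks above: the examples stop maintaining their own grammar state and "prev" token ring and instead go through the shared sampling helpers, where prompt tokens are accepted without advancing the grammar and generated tokens are accepted with it. Below is a minimal sketch of that pattern, assuming the common/sampling.h API exactly as it appears in this patch (llama_sampling_init/_sample/_accept/_last/_free); the header names, the loop shape and the elided llama_decode() step are illustrative only, not taken from the tree.

    // Sketch only: the accept-with-grammar-flag pattern used by the hunks above.
    // Assumes the common/sampling.h API as shown in this patch; header names, the
    // loop and the elided llama_decode() step are illustrative.
    #include "common.h"
    #include "sampling.h"

    static void generate_sketch(llama_context * ctx, const gpt_params & params,
                                const std::vector<llama_token> & prompt, int n_predict) {
        struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

        // prompt tokens: visible to the repetition penalties, but do not advance the grammar
        for (const llama_token tok : prompt) {
            llama_sampling_accept(ctx_sampling, ctx, tok, false);
        }

        for (int i = 0; i < n_predict; ++i) {
            // sample from the logits of the last evaluated batch (no guidance context here)
            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, /*ctx_cfg =*/ NULL);

            // generated tokens do advance the grammar
            llama_sampling_accept(ctx_sampling, ctx, id, true);

            if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
                break; // same end-of-text check the examples now do via llama_sampling_last()
            }

            // ... feed `id` back through llama_decode() before sampling again ...
        }

        llama_sampling_free(ctx_sampling);
    }
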
@@ -699,7 +693,7 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (ctx_sampling->prev.back() == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 69f9526a4..eb64adef8 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -157,7 +157,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.ctx_sampling = llama_sampling_init(params); + client.ctx_sampling = llama_sampling_init(params.sparams); } std::vector tokens_system; @@ -330,7 +330,7 @@ int main(int argc, char ** argv) { const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); - llama_sampling_accept(client.ctx_sampling, ctx, id); + llama_sampling_accept(client.ctx_sampling, ctx, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0471528a3..b5ad3cc99 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -195,10 +195,12 @@ struct llama_server_context json prompt; std::vector embd; + gpt_params params; + llama_model *model = nullptr; llama_context *ctx = nullptr; - gpt_params params; llama_sampling_context *ctx_sampling = nullptr; + int n_ctx; bool truncated = false; @@ -232,7 +234,7 @@ struct llama_server_context void rewind() { params.antiprompt.clear(); - params.grammar.clear(); + params.sparams.grammar.clear(); num_prompt_tokens = 0; num_tokens_predicted = 0; generated_text = ""; @@ -246,11 +248,14 @@ struct llama_server_context multibyte_pending = 0; n_remain = 0; n_past = 0; + params.sparams.n_prev = n_ctx; + } + void initSampling() { if (ctx_sampling != nullptr) { llama_sampling_free(ctx_sampling); } - ctx_sampling = llama_sampling_init(params); + ctx_sampling = llama_sampling_init(params.sparams); } bool loadModel(const gpt_params ¶ms_) @@ -311,16 +316,32 @@ struct llama_server_context return prompt_tokens; } - bool loadGrammar() - { - ctx_sampling = llama_sampling_init(params); - return true; + void truncatePrompt(std::vector &prompt_tokens) { + const int n_left = n_ctx - params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size; + + // Keep n_keep tokens at start of prompt (at most n_ctx - 4) + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated", { + {"n_ctx", n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + {"num_prompt_tokens", new_tokens.size()} + }); + + truncated = true; + prompt_tokens = new_tokens; } void loadInfill() { bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -336,6 +357,7 @@ struct llama_server_context prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx)); 
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); prefix_tokens.push_back(llama_token_middle(ctx)); + auto prompt_tokens = prefix_tokens; num_prompt_tokens = prompt_tokens.size(); @@ -347,31 +369,18 @@ struct llama_server_context params.n_keep = std::min(params.n_ctx - 4, params.n_keep); // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)params.n_ctx) + if (num_prompt_tokens >= (size_t) n_ctx) { - printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens); - // todo we probably want to cut from both sides - const int n_left = (params.n_ctx - params.n_keep) / 2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin()); + truncatePrompt(prompt_tokens); + num_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"n_ctx", params.n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, - }); - - truncated = true; - prompt_tokens = new_tokens; + GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx); } - else + + // push the prompt into the sampling context (do not apply grammar) + for (auto & token : prompt_tokens) { - const size_t ps = num_prompt_tokens; - std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps); + llama_sampling_accept(ctx_sampling, ctx, token, false); } // compare the evaluated prompt with the new prompt @@ -409,29 +418,18 @@ struct llama_server_context params.n_keep = std::min(n_ctx - 4, params.n_keep); // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t)n_ctx) + if (num_prompt_tokens >= (size_t) n_ctx) { - const int n_left = (n_ctx - params.n_keep) / 2; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left; - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); - std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin()); + truncatePrompt(prompt_tokens); + num_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, - }); - - truncated = true; - prompt_tokens = new_tokens; + GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx); } - else + + // push the prompt into the sampling context (do not apply grammar) + for (auto & token : prompt_tokens) { - const size_t ps = num_prompt_tokens; - std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0); - std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps); + llama_sampling_accept(ctx_sampling, ctx, token, false); } // compare the evaluated prompt with the new prompt @@ -530,8 +528,8 @@ struct llama_server_context llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false }; - const int32_t 
n_probs = params.sampling_params.n_probs; - if (params.sampling_params.temp <= 0 && n_probs > 0) + const int32_t n_probs = params.sparams.n_probs; + if (params.sparams.temp <= 0 && n_probs > 0) { // For llama_sample_token_greedy we need to sort candidates llama_sample_softmax(ctx, &cur_p); @@ -542,7 +540,7 @@ struct llama_server_context result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); } - llama_sampling_accept(ctx_sampling, ctx, result.tok); + llama_sampling_accept(ctx_sampling, ctx, result.tok, true); if (tg) { num_tokens_predicted++; @@ -606,7 +604,7 @@ struct llama_server_context const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; - if (params.sampling_params.n_probs > 0) + if (params.sparams.n_probs > 0) { generated_token_probs.push_back(token_with_probs); } @@ -1004,36 +1002,36 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, static json format_generation_settings(llama_server_context &llama) { - const auto & sparams = llama.params.sampling_params; + const auto & sparams = llama.params.sparams; const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx)); const bool ignore_eos = eos_bias != sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); return json{ - {"n_ctx", llama.n_ctx}, - {"model", llama.params.model_alias}, - {"seed", llama.params.seed}, - {"temp", sparams.temp}, - {"top_k", sparams.top_k}, - {"top_p", sparams.top_p}, - {"tfs_z", sparams.tfs_z}, - {"typical_p", sparams.typical_p}, - {"repeat_last_n", sparams.repeat_last_n}, - {"repeat_penalty", sparams.repeat_penalty}, - {"presence_penalty", sparams.presence_penalty}, - {"frequency_penalty", sparams.frequency_penalty}, - {"mirostat", sparams.mirostat}, - {"mirostat_tau", sparams.mirostat_tau}, - {"mirostat_eta", sparams.mirostat_eta}, - {"penalize_nl", sparams.penalize_nl}, - {"stop", llama.params.antiprompt}, - {"n_predict", llama.params.n_predict}, - {"n_keep", llama.params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", llama.stream}, - {"logit_bias", sparams.logit_bias}, - {"n_probs", sparams.n_probs}, - {"grammar", llama.params.grammar}, + {"n_ctx", llama.n_ctx}, + {"model", llama.params.model_alias}, + {"seed", llama.params.seed}, + {"temp", sparams.temp}, + {"top_k", sparams.top_k}, + {"top_p", sparams.top_p}, + {"tfs_z", sparams.tfs_z}, + {"typical_p", sparams.typical_p}, + {"repeat_last_n", sparams.penalty_last_n}, + {"repeat_penalty", sparams.penalty_repeat}, + {"frequency_penalty", sparams.penalty_freq}, + {"presence_penalty", sparams.penalty_present}, + {"mirostat", sparams.mirostat}, + {"mirostat_tau", sparams.mirostat_tau}, + {"mirostat_eta", sparams.mirostat_eta}, + {"penalize_nl", sparams.penalize_nl}, + {"stop", llama.params.antiprompt}, + {"n_predict", llama.params.n_predict}, + {"n_keep", llama.params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", llama.stream}, + {"logit_bias", sparams.logit_bias}, + {"n_probs", sparams.n_probs}, + {"grammar", llama.params.sparams.grammar}, }; } @@ -1081,7 +1079,7 @@ static json format_final_response(llama_server_context &llama, const std::string {"timings", format_timings(llama)}, }; - if (llama.params.sampling_params.n_probs > 0) + if (llama.params.sparams.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1097,7 +1095,7 @@ static json format_partial_response( {"stop", false}, }; - if (llama.params.sampling_params.n_probs > 0) + if 
(llama.params.sparams.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1129,28 +1127,30 @@ static T json_value(const json &body, const std::string &key, const T &default_v static void parse_options_completion(const json &body, llama_server_context &llama) { gpt_params default_params; - const auto & default_sparams = default_params.sampling_params; - auto & sparams = llama.params.sampling_params; + const auto & default_sparams = default_params.sparams; - llama.stream = json_value(body, "stream", false); - llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict); - sparams.top_k = json_value(body, "top_k", default_sparams.top_k); - sparams.top_p = json_value(body, "top_p", default_sparams.top_p); - sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); - sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); - sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n); - sparams.temp = json_value(body, "temperature", default_sparams.temp); - sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty); - sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty); - sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty); - sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); - sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); - llama.params.seed = json_value(body, "seed", default_params.seed); - llama.params.grammar = json_value(body, "grammar", default_params.grammar); - sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs); + auto & params = llama.params; + auto & sparams = llama.params.sparams; + + llama.stream = json_value(body, "stream", false); + params.n_predict = json_value(body, "n_predict", default_params.n_predict); + sparams.top_k = json_value(body, "top_k", default_sparams.top_k); + sparams.top_p = json_value(body, "top_p", default_sparams.top_p); + sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); + sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); + sparams.temp = json_value(body, "temperature", default_sparams.temp); + sparams.penalty_last_n = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); + sparams.penalty_repeat = json_value(body, "repeat_penalty", default_sparams.penalty_repeat); + sparams.penalty_freq = json_value(body, "frequency_penalty", default_sparams.penalty_freq); + sparams.penalty_present = json_value(body, "presence_penalty", default_sparams.penalty_present); + sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); + sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); + sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl); + params.n_keep = json_value(body, "n_keep", default_params.n_keep); + params.seed = json_value(body, "seed", default_params.seed); + sparams.grammar = json_value(body, "grammar", default_sparams.grammar); + sparams.n_probs = json_value(body, 
"n_probs", default_sparams.n_probs); if (body.count("prompt") != 0) { @@ -1204,8 +1204,6 @@ static void parse_options_completion(const json &body, llama_server_context &lla } } - llama.ctx_sampling = llama_sampling_init(llama.params); - LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); } @@ -1374,15 +1372,9 @@ int main(int argc, char **argv) llama.rewind(); llama_reset_timings(llama.ctx); - parse_options_completion(json::parse(req.body), llama); - if (!llama.loadGrammar()) - { - res.status = 400; - return; - } - + llama.initSampling(); llama.loadPrompt(); llama.beginCompletion(); @@ -1414,7 +1406,7 @@ int main(int argc, char **argv) } auto probs = llama.generated_token_probs; - if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) { + if (llama.params.sparams.n_probs > 0 && llama.stopped_word) { const std::vector stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); probs = std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); } @@ -1466,7 +1458,7 @@ int main(int argc, char **argv) std::vector probs_output = {}; - if (llama.params.sampling_params.n_probs > 0) { + if (llama.params.sparams.n_probs > 0) { const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); @@ -1537,14 +1529,9 @@ int main(int argc, char **argv) llama.rewind(); llama_reset_timings(llama.ctx); - parse_options_infill(json::parse(req.body), llama); - if (!llama.loadGrammar()) - { - res.status = 400; - return; - } + llama.initSampling(); llama.loadInfill(); llama.beginCompletion(); const auto chunked_content_provider = [&](size_t, DataSink & sink) { @@ -1587,7 +1574,7 @@ int main(int argc, char **argv) std::vector probs_output = {}; - if (llama.params.sampling_params.n_probs > 0) { + if (llama.params.sparams.n_probs > 0) { const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); @@ -1694,7 +1681,9 @@ int main(int argc, char **argv) const json body = json::parse(req.body); llama.rewind(); + llama_reset_timings(llama.ctx); + if (body.count("content") != 0) { llama.prompt = body["content"]; @@ -1704,6 +1693,8 @@ int main(int argc, char **argv) llama.prompt = ""; } llama.params.n_predict = 0; + + llama.initSampling(); llama.loadPrompt(); llama.beginCompletion(); llama.doCompletion(); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 24f49012a..894321ce9 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -112,16 +112,16 @@ int main(int argc, char ** argv) { bool has_eos = false; // target model sampling context - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); // draft sequence data std::vector drafts(n_seq_dft); - params.grammar.clear(); // the draft samplers will copy the target sampler's grammar - params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp); + params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar + 
params.sparams.temp = std::max(0.01f, params.sparams.temp); for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].ctx_sampling = llama_sampling_init(params); + drafts[s].ctx_sampling = llama_sampling_init(params.sparams); } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); @@ -154,7 +154,7 @@ int main(int argc, char ** argv) { // sample from the target model llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); - llama_sampling_accept(ctx_sampling, ctx_tgt, id); + llama_sampling_accept(ctx_sampling, ctx_tgt, id, true); //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); @@ -328,7 +328,7 @@ int main(int argc, char ** argv) { const int s = sa[is]; - llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id); + llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); drafts[s].tokens.push_back(id); diff --git a/llama.cpp b/llama.cpp index ec8ffad33..365349335 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1018,8 +1018,8 @@ enum e_model { }; static const size_t kB = 1024; -static const size_t MB = kB*kB; -static const size_t GB = kB*kB*kB; +static const size_t MB = 1024*kB; +static const size_t GB = 1024*MB; struct llama_hparams { bool vocab_only; @@ -1042,21 +1042,21 @@ struct llama_hparams { float f_max_alibi_bias; bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_head != other.n_head) return true; - if (this->n_head_kv != other.n_head_kv) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_ff != other.n_ff) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; const float EPSILON = 1e-9; - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; return false; @@ -1195,11 +1195,11 @@ struct llama_vocab { id special_sep_id = -1; id special_pad_id = -1; - id linefeed_id = 13; + id linefeed_id = 13; id special_prefix_id = 32007; id special_middle_id = 32009; id special_suffix_id = 32008; - id special_eot_id = 32010; + id special_eot_id = 32010; int find_bpe_rank(std::string token_left, std::string token_right) const { replace_all(token_left, " ", "\u0120"); @@ -1359,10 +1359,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - // TODO: this should be: - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); - 
// change it and test that it works - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; @@ -7417,37 +7414,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array llama_sample_temp(ctx, candidates_p, temp); } -void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { - if (last_tokens_size == 0 || penalty == 1.0f) { - return; - } - - const int64_t t_start_sample_us = ggml_time_us(); - - for (size_t i = 0; i < candidates->size; ++i) { - const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); - if (token_iter == last_tokens + last_tokens_size) { - continue; - } - - // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. - // This is common fix for this problem, which is to multiply by the penalty instead of dividing. - if (candidates->data[i].logit <= 0) { - candidates->data[i].logit *= penalty; - } else { - candidates->data[i].logit /= penalty; - } - } - - candidates->sorted = false; - - if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - } -} - -void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { - if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { +void llama_sample_repetition_penalties( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present) { + if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) { return; } @@ -7455,19 +7430,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l // Create a frequency map to count occurrences of each token in last_tokens std::unordered_map token_count; - for (size_t i = 0; i < last_tokens_size; ++i) { - token_count[last_tokens_p[i]]++; + for (size_t i = 0; i < penalty_last_n; ++i) { + token_count[last_tokens[i]]++; } // Apply frequency and presence penalties to the candidates for (size_t i = 0; i < candidates->size; ++i) { - auto token_iter = token_count.find(candidates->data[i].id); + const auto token_iter = token_count.find(candidates->data[i].id); if (token_iter == token_count.end()) { continue; } - int count = token_iter->second; - candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; + const int count = token_iter->second; + + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. 
+ if (candidates->data[i].logit <= 0) { + candidates->data[i].logit *= penalty_repeat; + } else { + candidates->data[i].logit /= penalty_repeat; + } + + candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; } candidates->sorted = false; diff --git a/llama.h b/llama.h index 51010e037..306f5b383 100644 --- a/llama.h +++ b/llama.h @@ -560,21 +560,15 @@ extern "C" { LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - LLAMA_API void llama_sample_repetition_penalty( - struct llama_context * ctx, - llama_token_data_array * candidates, - const llama_token * last_tokens, - size_t last_tokens_size, - float penalty); - /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - LLAMA_API void llama_sample_frequency_and_presence_penalties( + LLAMA_API void llama_sample_repetition_penalties( struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, - size_t last_tokens_size, - float alpha_frequency, - float alpha_presence); + size_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 019c0d462..32e58941c 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -8,11 +8,9 @@ #include #include #include -#include #include #include - static void dump(const llama_token_data_array * candidates) { for (size_t i = 0; i < candidates->size; i++) { printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); @@ -21,7 +19,6 @@ static void dump(const llama_token_data_array * candidates) { #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) - static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { size_t n_vocab = probs.size(); std::vector candidates; @@ -37,13 +34,12 @@ static void test_top_k(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; @@ -59,13 +55,12 @@ static void test_top_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float z) { size_t n_vocab = probs.size(); std::vector candidates; @@ -80,13 +75,12 @@ static void test_tfs(const std::vector & probs, const std::vector llama_sample_tail_free(nullptr, &candidates_p, z, 1); DUMP(&candidates_p); - assert(candidates_p.size == expected_probs.size()); + GGML_ASSERT(candidates_p.size == expected_probs.size()); for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); } } - static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { size_t n_vocab = probs.size(); std::vector candidates; @@ -101,18 +95,17 @@ static void 
test_typical(const std::vector & probs, const std::vector & probs, const std::vector & last_tokens, - const std::vector & expected_probs, float penalty + const std::vector & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence ) { - assert(probs.size() == expected_probs.size()); + GGML_ASSERT(probs.size() == expected_probs.size()); size_t n_vocab = probs.size(); std::vector candidates; @@ -125,41 +118,13 @@ static void test_repetition_penalty( llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_sample_softmax(nullptr, &candidates_p); DUMP(&candidates_p); - llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty); + llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); llama_sample_softmax(nullptr, &candidates_p); DUMP(&candidates_p); - assert(candidates_p.size == expected_probs.size()); + GGML_ASSERT(candidates_p.size == expected_probs.size()); for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6); - } -} - - -static void test_frequency_presence_penalty( - const std::vector & probs, const std::vector & last_tokens, - const std::vector & expected_probs, float alpha_frequency, float alpha_presence -) { - assert(probs.size() == expected_probs.size()); - - size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - float logit = log(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - // DUMP(&candidates_p); - llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence); - llama_sample_softmax(nullptr, &candidates_p); - // DUMP(&candidates_p); - - assert(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); } } @@ -181,13 +146,13 @@ int main(void) { test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f); test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); - test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); - test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f); - test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f); - 
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f); + test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f); printf("OK\n"); From 465219b9143ac01db0990bbcb0a081ef72ec2008 Mon Sep 17 00:00:00 2001 From: shibe2 Date: Thu, 12 Oct 2023 16:01:23 +0400 Subject: [PATCH 27/28] CLBlast: Add outer loops over src0 for broadcasting in mulmat Reduce repeated dequantization of the same data. --- ggml-opencl.cpp | 332 +++++++++++++++++++++++------------------------- 1 file changed, 162 insertions(+), 170 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 67ac20eac..202bcb485 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; - int64_t pi02 = -1; - int64_t pi03 = -1; - for (int64_t i13 = 0; i13 < ne13; i13++) { - int64_t i03 = i13 / r3; + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + if (src0->backend == GGML_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else { + // copy src0 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); + } - for (int64_t i12 = 0; i12 < ne12; i12++) { - int64_t i02 = i12 / r2; + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - // copy data to device - if (src0->backend == GGML_BACKEND_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else if (i02 != pi02 || i03 != pi03) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - pi02 = i02; - pi03 = i03; + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, x_offset, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } } - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - GGML_ASSERT(false); - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); } } @@ -1589,73 +1588,70 @@ static 
void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); size_t x_offset = 0; - int64_t pi02 = -1; - int64_t pi03 = -1; - for (int64_t i13 = 0; i13 < ne13; i13++) { - int64_t i03 = i13 / r3; - - for (int64_t i12 = 0; i12 < ne12; i12++) { - int64_t i02 = i12 / r2; - - // copy src0 to device - if (src0->backend == GGML_BACKEND_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else if (i02 != pi02 || i03 != pi03) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - pi02 = i02; - pi03 = i03; - } - - // convert src1 to fp16 - // TODO: use multiple threads - char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; - if (src1_cont_rows) { - if (src1_cont_cols) { - ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + if (src0->backend == GGML_BACKEND_GPU) { + x_offset = (i03 * ne02 + i02) * x_ne; + } else { + // copy src0 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); + + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + // convert src1 to fp16 + // TODO: use multiple threads + char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i11 = 0; i11 < ne11; i11++) { + ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); + } + } } - } - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - for (int64_t i10 = 0; i10 < ne10; i10++) { - // very slow due to no inlining - tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); + else { + for (int64_t i11 = 0; i11 < ne11; i11++) { + for (int64_t i10 = 0; i10 < ne10; i10++) { + // very slow due to no inlining + tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); + } + } } + + // copy src1 to device + CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); + + CL_CHECK(clFinish(queue)); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, x_offset, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host, then convert to float + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + ggml_fp16_to_fp32_row(tmp, d, d_ne); } } - - // copy src1 to device - CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - GGML_ASSERT(false); - } - - // copy dst to host, 
then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_fp16_to_fp32_row(tmp, d, d_ne); } } @@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * size_t ev_idx = 0; std::vector events; - int64_t pi02 = -1; - int64_t pi03 = -1; - - for (int64_t i13 = 0; i13 < ne13; i13++) { - int64_t i03 = i13 / r3; - - for (int64_t i12 = 0; i12 < ne12; i12++) { - int64_t i02 = i12 / r2; - - // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_CPU) { - if (i02 != pi02 || i03 != pi03) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + // TODO: copy and dequantize src0 here when r3>1 + for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy src0 to device if necessary + if (src0->backend == GGML_BACKEND_CPU) { events.emplace_back(); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - pi02 = i02; - pi03 = i03; - } - } else if (src0->backend == GGML_BACKEND_GPU) { - d_Q = (cl_mem) src0->extra; - } else { - GGML_ASSERT(false); - } - if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel - // copy src1 to device - events.emplace_back(); - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); - - // compute - const size_t global = ne01 * local; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; - const cl_int ncols = ne00; - events.emplace_back(); - CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); - CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); - } else { // general dequantization kernel + CLBlast matrix matrix multiplication - // convert src0 to fp32 on device - const size_t global = x_ne / global_denom; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; - CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? 
events.data() : NULL, NULL)); - - // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - events.emplace_back(); - - // wait for conversion - CL_CHECK(clFinish(queue)); - - // compute - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, events.data() + ev_idx++); - - if (status != clblast::StatusCode::kSuccess) { + } else if (src0->backend == GGML_BACKEND_GPU) { + d_Q = (cl_mem) src0->extra; + } else { GGML_ASSERT(false); } - } - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); - for (auto *event : events) { - clReleaseEvent(event); - } + if (!mul_mat_vec) { + // convert src0 to fp32 on device + const size_t global = x_ne / global_denom; + const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); + } - ev_idx = 0; - events.clear(); + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { + if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel + // copy src1 to device + events.emplace_back(); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); + + // compute + const size_t global = ne01 * local; + const size_t offset = src0->backend == GGML_BACKEND_GPU ? 
(i03 * ne02 + i02) * x_bps : 0; + const cl_int ncols = ne00; + events.emplace_back(); + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); + } else { // CLBlast matrix matrix multiplication + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + + // wait for conversion + CL_CHECK(clFinish(queue)); + + // compute + events.emplace_back(); + clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, + clblast::Transpose::kYes, clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, events.data() + ev_idx++); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); + for (auto *event : events) { + clReleaseEvent(event); + } + + ev_idx = 0; + events.clear(); + } + } } } From 22c69a27945e7acf9690dd3210d316f22182751c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 22 Oct 2023 08:37:20 +0300 Subject: [PATCH 28/28] batched : add len CLI argument --- examples/batched/batched.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 155212165..2797329b4 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -11,12 +11,16 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]); return 1 ; } + // number of parallel batches int n_parallel = 1; + // total length of the sequences including the prompt + int n_len = 32; + if (argc >= 2) { params.model = argv[1]; } @@ -29,13 +33,14 @@ int main(int argc, char ** argv) { n_parallel = std::atoi(argv[3]); } + if (argc >= 5) { + n_len = std::atoi(argv[4]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } - // total length of the sequences including the prompt - const int n_len = 32; - // init LLM llama_backend_init(params.numa);
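
With the final patch, the batched example reads the sequence length from a fourth positional argument instead of hard-coding it. As a purely illustrative invocation (model path and prompt are placeholders), something like `./batched models/7B/ggml-model-f16.gguf "Hello my name is" 4 64` would decode 4 parallel sequences of up to 64 tokens each, prompt included; omitting the argument keeps the previous default of 32.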
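
For callers porting to the merged penalties API introduced in the sampling refactor above, the following is a minimal sketch, assuming `ctx` points to a valid llama_context and `candidates` has already been filled from the current logits; `apply_penalties_example` and the concrete penalty values (1.1f / 0.0f / 0.0f) are illustrative placeholders, not part of the patch.

    #include "llama.h"

    #include <vector>

    // Hypothetical caller, not part of the patch: one call to
    // llama_sample_repetition_penalties replaces the former pair
    // llama_sample_repetition_penalty + llama_sample_frequency_and_presence_penalties.
    static void apply_penalties_example(
            struct llama_context * ctx,
            std::vector<llama_token_data> & candidates,
            const std::vector<llama_token> & last_tokens) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        llama_sample_repetition_penalties(
                ctx, &candidates_p,
                last_tokens.data(), last_tokens.size(), // penalty_last_n: how much history to penalize over
                1.1f,   // penalty_repeat  (the old repeat_penalty argument)
                0.0f,   // penalty_freq    (the old alpha_frequency)
                0.0f);  // penalty_present (the old alpha_presence)
    }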
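
The loop restructuring in the CLBlast patch is easier to see with the OpenCL plumbing stripped away. The skeleton below is illustrative only: ne02, ne03, r2 and r3 have the same meaning as in the hunks above (r2 = ne12/ne02, r3 = ne13/ne03), and `prepare_src0` / `run_gemm` are hypothetical stand-ins for the host-to-device copy, dequantization and GEMM work.

    #include <cstdint>
    #include <functional>

    // Sketch of the new loop nesting (hypothetical helpers, not the real kernel code):
    // each src0 slice (i03, i02) is prepared once per i13 and reused for the r2
    // broadcasted src1 slices (i13, i12) that map onto it; the TODOs in the patch note
    // that the preparation could be hoisted out of the i13 loop as well when r3 > 1.
    static void broadcast_loops_sketch(
            int64_t ne02, int64_t ne03, int64_t r2, int64_t r3,
            const std::function<void(int64_t, int64_t)> & prepare_src0,
            const std::function<void(int64_t, int64_t)> & run_gemm) {
        for (int64_t i03 = 0; i03 < ne03; i03++) {
            for (int64_t i13 = i03*r3, e13 = i13 + r3; i13 < e13; i13++) {
                for (int64_t i02 = 0; i02 < ne02; i02++) {
                    prepare_src0(i03, i02);
                    for (int64_t i12 = i02*r2, e12 = i12 + r2; i12 < e12; i12++) {
                        run_gemm(i13, i12);
                    }
                }
            }
        }
    }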