From ba2135ccae7462470b3865c6e41d2e1d734eac05 Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 21 Feb 2024 22:18:23 +0100
Subject: [PATCH 01/13] gemma : allow offloading the output tensor (#5646)

---
 llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 3a226c426..4054d5da6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4394,6 +4394,8 @@ static bool llm_load_tensors(
                     // output
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                    ml.n_created--; // artificial tensor

                     const int64_t n_ff = hparams.n_ff;
                     const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -7525,7 +7527,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);

From 7fe4678b0244ba7b03eae66ebeaa947e2770bb1a Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 21 Feb 2024 22:52:39 +0100
Subject: [PATCH 02/13] llama : fix session save/load with quantized KV (#5649)

---
 llama.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4054d5da6..d763cc80c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12176,18 +12176,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
         data_ctx->write(&kv_used, sizeof(kv_used));

         if (kv_buf_size) {
-            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
             std::vector<uint8_t> tmp_buf;
             for (int il = 0; il < (int) n_layer; ++il) {
-                tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+                tmp_buf.resize(k_size);
                 ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                 data_ctx->write(tmp_buf.data(), tmp_buf.size());

                 // v is not contiguous, copy row by row
-                tmp_buf.resize(elt_size*kv_head);
+                size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+                tmp_buf.resize(v_row_size);
                 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                     data_ctx->write(tmp_buf.data(), tmp_buf.size());
                 }
             }
@@ -12289,17 +12290,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         if (kv_buf_size) {
             GGML_ASSERT(kv_self.total_size() == kv_buf_size);

-            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
             for (int il = 0; il < (int) n_layer; ++il) {
-                size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
                 ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                 inp += k_size;

                 // v is not contiguous, copy row by row
-                size_t v_row_size = elt_size*kv_head;
+                size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
                 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                     inp += v_row_size;
                 }
             }
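Background note on PATCH 02 (illustrative, not part of the patch): ggml_element_size() reports bytes per element, which is not a whole number for block-quantized tensors, while ggml_row_size() rounds to whole blocks. A minimal sketch, assuming ggml's standard Q4_0 layout of 32-element, 18-byte blocks:

    // sketch.cpp: why the session code must size buffers with ggml_row_size()
    #include "ggml.h"
    #include <cstdio>

    int main(void) {
        const int64_t ne = 4096; // elements per row, e.g. n_embd_k_gqa*kv_head

        // F16: one element per 2-byte block, so 8192 bytes, same as elt_size*ne.
        printf("f16 : %zu bytes\n", ggml_row_size(GGML_TYPE_F16, ne));

        // Q4_0: 32 elements per 18-byte block, so 4096/32*18 = 2304 bytes.
        // A per-element size would be 0.5625 bytes, which the old elt_size*ne
        // arithmetic cannot express; hence the wrong buffer sizes when saving
        // or loading a quantized KV cache.
        printf("q4_0: %zu bytes\n", ggml_row_size(GGML_TYPE_Q4_0, ne));
        return 0;
    }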
From 7c8bcc11dc61cf5930b70cd0168b84afcebe12a9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 22 Feb 2024 00:31:00 +0100
Subject: [PATCH 03/13] Add docs for llama_chat_apply_template (#5645)

* add docs for llama_chat_apply_template

* fix typo
---
 examples/server/README.md | 1 +
 llama.h                   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 6d9f96cd4..4b24ee5dc 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
+- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)

 ## Build

diff --git a/llama.h b/llama.h
index 8ba20696f..84f196b3b 100644
--- a/llama.h
+++ b/llama.h
@@ -708,7 +708,7 @@ extern "C" {
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
    /// @param chat Pointer to a list of multiple llama_chat_message
    /// @param n_msg Number of llama_chat_message in this chat
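Usage sketch for the API documented above (illustrative, not part of the patch; it follows the convention stated in llama.h that the return value is the total length of the formatted prompt, which may exceed the buffer passed in). Assumes a loaded `model` handle:

    std::vector<llama_chat_message> chat = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };
    std::vector<char> buf(chat.size() * 256); // initial guess
    // nullptr template: use the template embedded in the model's metadata
    int32_t res = llama_chat_apply_template(model, nullptr, chat.data(), chat.size(), true,
                                            buf.data(), buf.size());
    // a negative result means the template is not one of the known formats
    if (res > (int32_t) buf.size()) {
        // buffer was too small: grow it and apply the template again
        buf.resize(res);
        res = llama_chat_apply_template(model, nullptr, chat.data(), chat.size(), true,
                                        buf.data(), buf.size());
    }
    std::string prompt(buf.data(), res); // the formatted chat prompt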
From 973053d8b0d04809836b3339a50f68d9c842de90 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 22 Feb 2024 00:42:09 +0100
Subject: [PATCH 04/13] llama : fix loading models with shared tok_embd and output (#5651)

ggml-ci
---
 llama.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d763cc80c..259f2a3a3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2791,13 +2791,7 @@ struct llama_model_loader {

         std::vector<no_init<uint8_t>> read_buf;

-        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
-            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-            if (!cur) {
-                // some tensors may be allocated in a different context
-                continue;
-            }
-
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
@@ -3722,7 +3716,7 @@ static bool llm_load_tensors(
     }

     // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output

     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
@@ -3860,6 +3854,7 @@ static bool llm_load_tensors(
             } else {
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                 ml.n_created--; // artificial tensor
+                ml.size_data += ggml_nbytes(model.output);
             }
         }
@@ -4396,6 +4391,7 @@ static bool llm_load_tensors(
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
             ml.n_created--; // artificial tensor
+            ml.size_data += ggml_nbytes(model.output);

             const int64_t n_ff = hparams.n_ff;
             const int64_t n_embd_head_k = hparams.n_embd_head_k;
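Note on the context sizing touched above (illustrative, not from the patch): with `no_alloc = true` a ggml context stores only tensor metadata, so its buffer is sized as the per-tensor overhead times the number of tensors it must hold; the extra slot covers models that reuse tok_embd as the output tensor. Sketch, with `n_tensors` standing in for `ml.n_tensors`:

    // metadata-only context: one slot per tensor, plus one spare slot for the
    // duplicated output tensor accounted for by ml.n_created-- above
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*(n_tensors + 1),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);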
From 4ef245a92a968ba0f18a5adfd41e51980ce4fdf5 Mon Sep 17 00:00:00 2001
From: Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
Date: Thu, 22 Feb 2024 18:15:13 +1000
Subject: [PATCH 05/13] mpt : add optional bias tensors (#5638)

Update for MPT with optional bias parameters: to work with PhoGPT and
SEA-LION models that were pre-trained with 'bias'.
---
 llama.cpp | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 259f2a3a3..9cae8c761 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4054,6 +4054,8 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+
                     model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                 }
@@ -4063,14 +4065,23 @@ static bool llm_load_tensors(

             auto & layer = model.layers[i];

-            layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+            layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+            layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);

             layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-            layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+            layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);

-            layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
-            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+            layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+            layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+
+            layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+            layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+
+            layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+
+            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

             // AWQ ScaleActivation layer
             layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -6171,7 +6182,7 @@ struct llm_build_context {

             attn_norm = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
-                    NULL,
+                    model.layers[il].attn_norm_b,
                     LLM_NORM, cb, il);
             cb(attn_norm, "attn_norm", il);
@@ -6181,6 +6192,11 @@ struct llm_build_context {
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
+
+                if (model.layers[il].bqkv){
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }

                 if (hparams.f_clamp_kqv > 0.0f) {
                     cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
@@ -6198,7 +6214,7 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
@@ -6211,13 +6227,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -6234,7 +6250,7 @@ struct llm_build_context {
         cur = llm_build_norm(ctx0, cur, hparams,
                 model.output_norm,
-                NULL,
+                model.output_norm_b,
                 LLM_NORM, cb, -1);
         cb(cur, "result_norm", -1);
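Note on the optional-tensor convention used in PATCH 05 (a sketch, not from the patch): the trailing `false` argument to `create_tensor` marks the tensor as not required, so checkpoints without bias weights load with those pointers left null, and the graph-building code guards each bias addition the way the `bqkv` hunk does. An assumed helper expressing the same idea:

    // hypothetical helper: add a bias only when the checkpoint provided one
    static struct ggml_tensor * maybe_add_bias(
            struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * bias) {
        // bias is NULL for the original MPT checkpoints and non-NULL for
        // PhoGPT/SEA-LION models pre-trained with 'bias'
        return bias ? ggml_add(ctx, cur, bias) : cur;
    }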
From c5688c6250430d2b8e0259efcf26c16dfa4c1f46 Mon Sep 17 00:00:00 2001
From: Alexey Parfenov
Date: Thu, 22 Feb 2024 08:27:32 +0000
Subject: [PATCH 06/13] server : clarify some params in the docs (#5640)

---
 examples/server/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 4b24ee5dc..4b6cd8326 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -151,7 +151,7 @@ node index.js

     `temperature`: Adjust the randomness of the generated text (default: 0.8).

-    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
+    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).

     `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).

@@ -209,7 +209,7 @@ node index.js

     `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

-    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
+    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

@@ -242,7 +242,7 @@ Notice that each `probs` is an array of length `n_probs`.

 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
-- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
 - `model`: The path to the model loaded with `-m`
 - `prompt`: The provided `prompt`
 - `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token

From a46f50747b2028f7f9c9883b26bfba12bf92556e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 22 Feb 2024 09:33:24 +0100
Subject: [PATCH 07/13] server : fallback to chatml, add AlphaMonarch chat template (#5628)

* server: fallback to chatml

* add new chat template

* server: add AlphaMonarch to test chat template

* server: only check model template if there is no custom tmpl

* remove TODO
---
 examples/server/server.cpp   | 15 +++++++++++++++
 llama.cpp                    |  9 +++++++++
 tests/test-chat-template.cpp | 23 +++++++++++++++--------
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c84719a0d..369121e88 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -400,6 +400,16 @@ struct llama_server_context
         return true;
     }

+    void validate_model_chat_template(server_params & sparams) {
+        llama_chat_message chat[] = {{"user", "test"}};
+        std::vector<char> buf(1);
+        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
+        if (res < 0) {
+            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
+        }
+    }
+
     void initialize() {
         // create slots
         all_slots_are_idle = true;
@@ -2752,6 +2762,11 @@ int main(int argc, char **argv)
         LOG_INFO("model loaded", {});
     }

+    if (sparams.chat_template.empty()) { // custom chat template is not supplied
+        // check if the template comes with the model is supported by us
+        llama.validate_model_chat_template(sparams);
+    }
+
     // Middleware for API key validation
     auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
         // If API key is not set, skip validation
diff --git a/llama.cpp b/llama.cpp
index 9cae8c761..055b57e31 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12773,6 +12773,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+        for (auto message : chat) {
+            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+            ss << bos << message->role << "\n" << message->content << "</s>\n";
+        }
+        if (add_ass) {
+            ss << "<s>assistant\n";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 9830650d4..d02b39e14 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -27,12 +27,20 @@ int main(void) {
         "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
         // bofenghuang/vigogne-2-70b-chat
         "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+        // mlabonne/AlphaMonarch-7B
+        "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
     };
-    std::vector<std::string> expected_substr = {
-        "<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
-        "[/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
-        "[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]",
-        "[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+    std::vector<std::string> expected_output = {
+        // teknium/OpenHermes-2.5-Mistral-7B
+        "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
+        // mistralai/Mistral-7B-Instruct-v0.2
+        "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+        // TheBloke/FusionNet_34Bx2_MoE-AWQ
+        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+        // bofenghuang/vigogne-2-70b-chat
+        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+        // mlabonne/AlphaMonarch-7B
+        "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
@@ -43,7 +51,7 @@ int main(void) {

     for (size_t i = 0; i < templates.size(); i++) {
         std::string custom_template = templates[i];
-        std::string substr = expected_substr[i];
+        std::string expected = expected_output[i];
         formatted_chat.resize(1024);
         res = llama_chat_apply_template(
             nullptr,
@@ -57,8 +65,7 @@ int main(void) {
         formatted_chat.resize(res);
         std::string output(formatted_chat.data(), formatted_chat.size());
         std::cout << output << "\n-------------------------\n";
-        // expect the "formatted_chat" to contain pre-defined strings
-        assert(output.find(substr) != std::string::npos);
+        assert(output == expected);
     }
     return 0;
 }
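The template validation added in PATCH 07 amounts to a cheap capability probe (sketch, assuming a loaded `model`): apply the model's embedded template to a one-message chat and treat a negative return value as "template not recognized", in which case the server substitutes a string containing `<|im_start|>`, which the matcher identifies as chatml:

    llama_chat_message probe[] = {{"user", "test"}};
    char buf[32]; // size is irrelevant here; only the sign of the result matters
    int32_t res = llama_chat_apply_template(model, nullptr, probe, 1, true, buf, sizeof(buf));
    const bool supported = res >= 0; // negative: not one of the known templates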
From 56d03d92be57f5880b9ed94542d87bb6effae31f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 22 Feb 2024 10:35:54 +0200
Subject: [PATCH 08/13] readme : update hot topics

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c1624b9f9..3bc512af0 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

+- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
 - Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216

From 3a03541cedea474fa9d41214484cc3fbcf468a9e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 22 Feb 2024 13:54:03 +0200
Subject: [PATCH 09/13] minor : fix trailing whitespace (#5638)

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 055b57e31..6ab5e1bf4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6192,7 +6192,7 @@ struct llm_build_context {
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
-                
+
                 if (model.layers[il].bqkv){
                     cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                     cb(cur, "bqkv", il);

From 4cb4d8b22d4fda971621a68c570ce84d66897c37 Mon Sep 17 00:00:00 2001
From: Someone
Date: Thu, 22 Feb 2024 16:32:09 +0000
Subject: [PATCH 10/13] workflows: nix: hardcode cachix ids, build unconditionally (#5663)

GitHub does not expose environment and repository variables to PRs coming
from forks, which implies that we've been disabling the Nix CI actions for
most PRs.

The `if:` also didn't make much sense, because we can always pull from
cachix, and there's no point (albeit no risk either) in pushing cache for
the untrusted code.
---
 .github/workflows/nix-ci-aarch64.yml |  7 +++----
 .github/workflows/nix-ci.yml         | 11 +++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
index 0c6cf5f09..8d0a3fd7f 100644
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -19,7 +19,6 @@ on:

 jobs:
   nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
@@ -37,8 +36,8 @@ jobs:
           extra-conf: |
             extra-platforms = aarch64-linux
             extra-system-features = nixos-test kvm
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
       - name: Set up cachix
         uses: cachix/cachix-action@v13
         with:
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: ${{ vars.CACHIX_NAME }}
+          name: llama-cpp
       - name: Show all output paths
         run: >
           nix run github:nix-community/nix-eval-jobs
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
index d19c7a576..01c5a9d5a 100644
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -23,8 +23,8 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           extra-conf: |
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -37,7 +37,6 @@ jobs:
           --flake ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"

   nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
     strategy:
       fail-fast: false
       matrix:
@@ -51,8 +50,8 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           extra-conf: |
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
       - name: Set up cachix
         uses: cachix/cachix-action@v13
         with:
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: ${{ vars.CACHIX_NAME }}
+          name: llama-cpp
       - name: Build
         run: >
           nix run github:Mic92/nix-fast-build

From 373ee3fbbabc4c1508eed4f5c3795b23a20939a3 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 22 Feb 2024 19:10:21 +0100
Subject: [PATCH 11/13] Add Gemma chat template (#5665)

* add gemma chat template

* gemma: only apply system_prompt on non-model message
---
 llama.cpp                    | 22 ++++++++++++++++++++++
 tests/test-chat-template.cpp |  4 ++++
 2 files changed, 26 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 6ab5e1bf4..40dda265c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12782,6 +12782,28 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
+    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+        // google/gemma-7b-it
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
+                system_prompt = trim(message->content);
+                continue;
+            }
+            // in gemma, "assistant" is "model"
+            role = role == "assistant" ? "model" : message->role;
+            ss << "<start_of_turn>" << role << "\n";
+            if (!system_prompt.empty() && role != "model") {
+                ss << system_prompt << "\n\n";
+                system_prompt = "";
+            }
+            ss << trim(message->content) << "<end_of_turn>\n";
+        }
+        if (add_ass) {
+            ss << "<start_of_turn>model\n";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index d02b39e14..fa2eb577b 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -29,6 +29,8 @@ int main(void) {
         "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
         // mlabonne/AlphaMonarch-7B
         "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
+        // google/gemma-7b-it
+        "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -41,6 +43,8 @@ int main(void) {
         "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
         // mlabonne/AlphaMonarch-7B
         "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+        // google/gemma-7b-it
+        "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;

From 5a9e2f60ba3d8362ba17c77ac3092906d49b813f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 22 Feb 2024 20:13:25 +0200
Subject: [PATCH 12/13] py : minor fixes (#5668)

---
 convert-hf-to-gguf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 9771fccf9..8630bbf29 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -655,6 +655,8 @@ class OrionModel(Model):
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
+        # note: config provides rms norm but it is actually layer norm
+        # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
         self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

     def write_tensors(self):
@@ -1031,7 +1033,6 @@ class PersimmonModel(Model):
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

     def set_vocab(self):
         self._set_vocab_sentencepiece()

From 201294ae177b308fb3a99dc504dd6d27e8afa907 Mon Sep 17 00:00:00 2001
From: Someone
Date: Thu, 22 Feb 2024 19:44:10 +0000
Subject: [PATCH 13/13] nix: init singularity and docker images (#5056)

Exposes a few attributes demonstrating how to build
[singularity](https://docs.sylabs.io/guides/latest/user-guide/)/[apptainer](https://apptainer.org/)
and Docker images re-using llama.cpp's Nix expression.

Built locally on `x86_64-linux` with
`nix build github:someoneserge/llama.cpp/feat/nix/images#llamaPackages.{docker,docker-min,sif,llama-cpp}`
and it's fast and effective.
---
 .devops/nix/docker.nix | 37 +++++++++++++++++++++++++++++++++++++
 .devops/nix/scope.nix  |  3 +++
 .devops/nix/sif.nix    | 27 +++++++++++++++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100644 .devops/nix/docker.nix
 create mode 100644 .devops/nix/sif.nix

diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix
new file mode 100644
index 000000000..d607b4575
--- /dev/null
+++ b/.devops/nix/docker.nix
@@ -0,0 +1,37 @@
+{
+  lib,
+  dockerTools,
+  buildEnv,
+  llama-cpp,
+  interactive ? true,
+  coreutils,
+}:
+
+# A tar that can be fed into `docker load`:
+#
+#     $ nix build .#llamaPackages.docker
+#     $ docker load < result
+
+# For details and variations cf.
+# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
+# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
+# - https://nixery.dev/
+
+# Approximate (compressed) sizes, at the time of writing, are:
+#
+# .#llamaPackages.docker: 125M;
+# .#llamaPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+
+dockerTools.buildLayeredImage {
+  name = llama-cpp.pname;
+  tag = "latest";
+
+  contents =
+    [ llama-cpp ]
+    ++ lib.optionals interactive [
+      coreutils
+      dockerTools.binSh
+      dockerTools.caCertificates
+    ];
+}
diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix
index d295995a4..78530c9e8 100644
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -12,5 +12,8 @@ lib.makeScope newScope (
   self: {
     inherit llamaVersion;
     llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
   }
 )
diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix
new file mode 100644
index 000000000..7535ca0f3
--- /dev/null
+++ b/.devops/nix/sif.nix
@@ -0,0 +1,27 @@
+{
+  lib,
+  singularity-tools,
+  llama-cpp,
+  bashInteractive,
+  interactive ? false,
+}:
+
+let
+  optionalInt = cond: x: if cond then x else 0;
+in
+singularity-tools.buildImage rec {
+  inherit (llama-cpp) name;
+  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+
+  # These are excessive (but safe) for most variants. Building singularity
+  # images requires superuser privileges, so we build them inside a VM in a
+  # writable image of pre-determined size.
+  #
+  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
+  #
+  # Expected image sizes:
+  # - cpu/blas: 150M,
+  # - cuda, all gencodes: 560M,
+  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+  memSize = diskSize;
+}