From 5872e4f4da577d0dff43346f4db3d965cb61d437 Mon Sep 17 00:00:00 2001
From: "Wile E. Coyote" <you@example.com>
Date: Sun, 22 Oct 2023 21:45:30 -0400
Subject: [PATCH] server support for system, prefix, and suffix prompts with
 special tokens

---
 examples/server/README.md  |   26 +
 examples/server/server.cpp | 3473 +++++++++++++++++++-----------------
 2 files changed, 1813 insertions(+), 1686 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 715007735..138b87f8c 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -200,6 +200,24 @@ node index.js
 
     `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 
+    `system`: Set the system prompt added before the text prompt (arrays).  It is independent of `system_prompt` above
+                    and should not be used together with it.
+
+    `input_prefix`: Set the prefix added to input text prompt lines.
+
+    `input_suffix`: Set the suffix added to input text prompt lines.
+
+     The `system`, `input_prefix`, and `input_suffix` prompts are tokenized with
+     the special tokens that some models require to work correctly.  Using these
+     three prompts lets the server API support a full, externally accumulated
+     chat history that alternates between user inputs and generated outputs line
+     by line, with the desired system header, input prefix, and input suffix
+     delineating user and generated lines, without relying on any context memory
+     in the server.  For this to work correctly, input prompts must not contain
+     any hard line feeds, so that the prompt array alternates between user input
+     and generated output on every line.  Hard line feeds in input prompts need
+     to be replaced with an ASCII `\n` sequence or a space.
+
 -   **POST** `/tokenize`: Tokenize a given text.
 
     *Options:*
@@ -208,6 +226,14 @@ node index.js
 
     Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
 
+-   **POST** `/tokenizes`: Tokenize a given text with special tokens.
+
+    *Options:*
+
+    `content`: Set the text to tokenize with special tokens.
+
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+
 -   **POST** `/detokenize`: Convert tokens to text.
 
     *Options:*
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c3279dbc9..fc36abbeb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -49,10 +49,10 @@ static bool server_verbose = false;
 #define LOG_VERBOSE(MSG, ...)                                            \
     do                                                                   \
     {                                                                    \
-        if (server_verbose)                                              \
-        {                                                                \
-            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
-        }                                                                \
+	if (server_verbose)                                              \
+	{                                                                \
+	    server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+	}                                                                \
     } while (0)
 #endif
 
@@ -65,9 +65,9 @@ static bool server_verbose = false;
 //
 
 static const std::string base64_chars =
-             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-             "abcdefghijklmnopqrstuvwxyz"
-             "0123456789+/";
+	     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+	     "abcdefghijklmnopqrstuvwxyz"
+	     "0123456789+/";
 
 static inline bool is_base64(uint8_t c)
 {
@@ -89,46 +89,46 @@ static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
 
     while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
     {
-        char_array_4[i++] = encoded_string[in_]; in_++;
-        if (i == 4)
-        {
-            for (i = 0; i <4; i++)
-            {
-                char_array_4[i] = base64_chars.find(char_array_4[i]);
-            }
+	char_array_4[i++] = encoded_string[in_]; in_++;
+	if (i == 4)
+	{
+	    for (i = 0; i <4; i++)
+	    {
+		char_array_4[i] = base64_chars.find(char_array_4[i]);
+	    }
 
-            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
-            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+	    char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+	    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+	    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
 
-            for (i = 0; (i < 3); i++)
-            {
-                ret.push_back(char_array_3[i]);
-            }
-            i = 0;
-        }
+	    for (i = 0; (i < 3); i++)
+	    {
+		ret.push_back(char_array_3[i]);
+	    }
+	    i = 0;
+	}
     }
 
     if (i)
     {
-        for (j = i; j <4; j++)
-        {
-            char_array_4[j] = 0;
-        }
+	for (j = i; j <4; j++)
+	{
+	    char_array_4[j] = 0;
+	}
 
-        for (j = 0; j <4; j++)
-        {
-            char_array_4[j] = base64_chars.find(char_array_4[j]);
-        }
+	for (j = 0; j <4; j++)
+	{
+	    char_array_4[j] = base64_chars.find(char_array_4[j]);
+	}
 
-        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
-        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+	char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+	char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+	char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
 
-        for (j = 0; (j < i - 1); j++)
-        {
-            ret.push_back(char_array_3[j]);
-        }
+	for (j = 0; (j < i - 1); j++)
+	{
+	    ret.push_back(char_array_3[j]);
+	}
     }
 
     return ret;
@@ -183,6 +183,7 @@ struct slot_params
 
     std::vector<std::string> antiprompt;
 
+    json system;
     json input_prefix;
     json input_suffix;
 };
@@ -205,8 +206,8 @@ struct completion_token_output
 {
     struct token_prob
     {
-        llama_token tok;
-        float prob;
+	llama_token tok;
+	float prob;
     };
 
     std::vector<token_prob> probs;
@@ -232,26 +233,26 @@ enum stop_type
 static bool ends_with(const std::string &str, const std::string &suffix)
 {
     return str.size() >= suffix.size() &&
-           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
+	   0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
 
 static size_t find_partial_stop_string(const std::string &stop,
-                                       const std::string &text)
+				       const std::string &text)
 {
     if (!text.empty() && !stop.empty())
     {
-        const char text_last_char = text.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
-        {
-            if (stop[char_index] == text_last_char)
-            {
-                const std::string current_partial = stop.substr(0, char_index + 1);
-                if (ends_with(text, current_partial))
-                {
-                    return text.size() - char_index - 1;
-                }
-            }
-        }
+	const char text_last_char = text.back();
+	for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
+	{
+	    if (stop[char_index] == text_last_char)
+	    {
+		const std::string current_partial = stop.substr(0, char_index + 1);
+		if (ends_with(text, current_partial))
+		{
+		    return text.size() - char_index - 1;
+		}
+	    }
+	}
     }
     return std::string::npos;
 }
@@ -263,26 +264,26 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += llama_token_to_piece(ctx, *begin);
+	ret += llama_token_to_piece(ctx, *begin);
     }
     return ret;
 }
 
 static void server_log(const char *level, const char *function, int line,
-                       const char *message, const nlohmann::ordered_json &extra)
+		       const char *message, const nlohmann::ordered_json &extra)
 {
     nlohmann::ordered_json log
     {
-        {"timestamp", time(nullptr)},
-        {"level",     level},
-        {"function",  function},
-        {"line",      line},
-        {"message",   message},
+	{"timestamp", time(nullptr)},
+	{"level",     level},
+	{"function",  function},
+	{"line",      line},
+	{"message",   message},
     };
 
     if (!extra.empty())
     {
-        log.merge_patch(extra);
+	log.merge_patch(extra);
     }
 
     const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
@@ -298,10 +299,10 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
     //   (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)
     {
-        std::stringstream ss;
-        ss << std::hex << (out[0] & 0xff);
-        std::string res(ss.str());
-        out = "byte: \\x" + res;
+	std::stringstream ss;
+	ss << std::hex << (out[0] & 0xff);
+	std::string res(ss.str());
+	out = "byte: \\x" + res;
     }
     return out;
 }
@@ -312,21 +313,21 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<com
     json out = json::array();
     for (const auto &prob : probs)
     {
-        json probs_for_token = json::array();
-        for (const auto &p : prob.probs)
-        {
-            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json
-            {
-                {"tok_str", tok_str},
-                {"prob",    p.prob},
-            });
-        }
-        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json{
-            {"content", tok_str},
-            {"probs",   probs_for_token},
-        });
+	json probs_for_token = json::array();
+	for (const auto &p : prob.probs)
+	{
+	    std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
+	    probs_for_token.push_back(json
+	    {
+		{"tok_str", tok_str},
+		{"prob",    p.prob},
+	    });
+	}
+	std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
+	out.push_back(json{
+	    {"content", tok_str},
+	    {"probs",   probs_for_token},
+	});
     }
     return out;
 }
@@ -336,8 +337,8 @@ static T json_value(const json &body, const std::string &key, const T &default_v
 {
     // Fallback null to default value
     return body.contains(key) && !body.at(key).is_null()
-        ? body.value(key, default_value)
-        : default_value;
+	? body.value(key, default_value)
+	: default_value;
 }
 
 struct llama_client_slot
@@ -397,92 +398,92 @@ struct llama_client_slot
     double t_token_generation; // ms
 
     void reset() {
-        num_prompt_tokens      = 0;
-        generated_text         = "";
-        truncated              = false;
-        stopped_eos            = false;
-        stopped_word           = false;
-        stopped_limit          = false;
-        stopping_word          = "";
-        multibyte_pending      = 0;
-        n_past                 = 0;
-        sent_count             = 0;
-        sent_token_probs_index = 0;
-        infill                 = false;
+	num_prompt_tokens      = 0;
+	generated_text         = "";
+	truncated              = false;
+	stopped_eos            = false;
+	stopped_word           = false;
+	stopped_limit          = false;
+	stopping_word          = "";
+	multibyte_pending      = 0;
+	n_past                 = 0;
+	sent_count             = 0;
+	sent_token_probs_index = 0;
+	infill                 = false;
 
-        generated_token_probs.clear();
+	generated_token_probs.clear();
 
-        for (slot_image &img : images)
-        {
-            free(img.image_embedding);
-            delete[] img.img_data.data;
-            img.prefix_prompt = "";
-        }
+	for (slot_image &img : images)
+	{
+	    free(img.image_embedding);
+	    delete[] img.img_data.data;
+	    img.prefix_prompt = "";
+	}
 
-        images.clear();
-        // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
+	images.clear();
+	// llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
     }
 
     bool has_budget(gpt_params &global_params) {
-        n_remaining = -1;
-        if(params.n_predict != -1)
-        {
-            n_remaining = params.n_predict - n_decoded;
-        }
-        else if (global_params.n_predict != -1)
-        {
-            n_remaining = global_params.n_predict - n_decoded;
-        }
-        return n_remaining > 0 || n_remaining == -1; // no budget || limitless
+	n_remaining = -1;
+	if(params.n_predict != -1)
+	{
+	    n_remaining = params.n_predict - n_decoded;
+	}
+	else if (global_params.n_predict != -1)
+	{
+	    n_remaining = global_params.n_predict - n_decoded;
+	}
+	return n_remaining > 0 || n_remaining == -1; // no budget || limitless
     }
 
     bool available() const {
-        return state == IDLE && command == NONE;
+	return state == IDLE && command == NONE;
     }
 
     bool is_processing() const {
-        return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
+	return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
     }
 
     void add_token_string(const completion_token_output &token) {
-        if (command == RELEASE)
-        {
-            return;
-        }
-        cache_tokens.push_back(token.tok);
-        generated_token_probs.push_back(token);
+	if (command == RELEASE)
+	{
+	    return;
+	}
+	cache_tokens.push_back(token.tok);
+	generated_token_probs.push_back(token);
     }
 
     void release() {
-        if (state == PROCESSING)
-        {
-            t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
-            command = RELEASE;
-        }
+	if (state == PROCESSING)
+	{
+	    t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
+	    command = RELEASE;
+	}
     }
 
     json get_formated_timings() {
-        return json
-        {
-            {"prompt_n",               num_prompt_tokens_processed},
-            {"prompt_ms",              t_prompt_processing},
-            {"prompt_per_token_ms",    t_prompt_processing / num_prompt_tokens_processed},
-            {"prompt_per_second",      1e3 / t_prompt_processing * num_prompt_tokens_processed},
+	return json
+	{
+	    {"prompt_n",               num_prompt_tokens_processed},
+	    {"prompt_ms",              t_prompt_processing},
+	    {"prompt_per_token_ms",    t_prompt_processing / num_prompt_tokens_processed},
+	    {"prompt_per_second",      1e3 / t_prompt_processing * num_prompt_tokens_processed},
 
-            {"predicted_n",            n_decoded},
-            {"predicted_ms",           t_token_generation},
-            {"predicted_per_token_ms", t_token_generation / n_decoded},
-            {"predicted_per_second",   1e3 / t_token_generation * n_decoded},
-        };
+	    {"predicted_n",            n_decoded},
+	    {"predicted_ms",           t_token_generation},
+	    {"predicted_per_token_ms", t_token_generation / n_decoded},
+	    {"predicted_per_second",   1e3 / t_token_generation * n_decoded},
+	};
     }
 
     void print_timings() {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
-        LOG_TEE("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
-        LOG_TEE("%s:       total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+	LOG_TEE("\n");
+	LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+	    __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
+	LOG_TEE("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+	    __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
+	LOG_TEE("%s:       total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
     }
 };
 
@@ -523,1245 +524,1333 @@ struct llama_server_context
 
     ~llama_server_context()
     {
-        if (ctx)
-        {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-        if (model)
-        {
-            llama_free_model(model);
-            model = nullptr;
-        }
+	if (ctx)
+	{
+	    llama_free(ctx);
+	    ctx = nullptr;
+	}
+	if (model)
+	{
+	    llama_free_model(model);
+	    model = nullptr;
+	}
     }
 
     bool load_model(const gpt_params &params_)
     {
-        params = params_;
-        if (!params.mmproj.empty()) {
-            multimodal = true;
-            LOG_TEE("Multi Modal Mode Enabled");
-            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
-            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
-                return false;
-            }
+	params = params_;
+	if (!params.mmproj.empty()) {
+	    multimodal = true;
+	    LOG_TEE("Multi Modal Mode Enabled");
+	    clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
+	    if(clp_ctx == nullptr) {
+		LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+		return false;
+	    }
 
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
-                params.n_ctx = 2048;
-            }
-        }
+	    if (params.n_ctx < 2048) { // request larger context for the image embedding
+		params.n_ctx = 2048;
+	    }
+	}
 
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
-        if (model == nullptr)
-        {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
-            return false;
-        }
+	std::tie(model, ctx) = llama_init_from_gpt_params(params);
+	if (model == nullptr)
+	{
+	    LOG_ERROR("unable to load model", {{"model", params.model}});
+	    return false;
+	}
 
-        if (multimodal) {
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_n_embd(model);
-            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
-                llama_free(ctx);
-                llama_free_model(model);
-                return false;
-            }
-        }
+	if (multimodal) {
+	    const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
+	    const int n_embd_llm  = llama_n_embd(model);
+	    if (n_embd_clip != n_embd_llm) {
+		LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+		llama_free(ctx);
+		llama_free_model(model);
+		return false;
+	    }
+	}
 
-        n_ctx = llama_n_ctx(ctx);
+	n_ctx = llama_n_ctx(ctx);
 
-        return true;
+	return true;
     }
 
     void initialize() {
-        id_gen = 0;
+	id_gen = 0;
 
-        // create slots
-        all_slots_are_idle = true;
+	// create slots
+	all_slots_are_idle = true;
 
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+	const int32_t n_ctx_slot = n_ctx / params.n_parallel;
 
-        LOG_TEE("Available slots:\n");
-        for (int i = 0; i < params.n_parallel; i++)
-        {
-            llama_client_slot slot;
+	LOG_TEE("Available slots:\n");
+	for (int i = 0; i < params.n_parallel; i++)
+	{
+	    llama_client_slot slot;
 
-            slot.id = i;
-            slot.n_ctx = n_ctx_slot;
-            slot.reset();
+	    slot.id = i;
+	    slot.n_ctx = n_ctx_slot;
+	    slot.reset();
 
-            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
-            slots.push_back(slot);
-        }
+	    LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+	    slots.push_back(slot);
+	}
 
-        batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+	batch = llama_batch_init(n_ctx, 0, params.n_parallel);
 
-        // empty system prompt
-        system_prompt = "";
-        system_tokens.clear();
+	// empty system prompt
+	system_prompt = "";
+	system_tokens.clear();
     }
 
-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
+    std::vector<llama_token> tokenize(const json & json_prompt,
+				      bool add_bos, bool special=false,
+				      const json & json_system=NULL,
+				      const json & json_prefix=NULL,
+				      const json & json_suffix=NULL) const
     {
-        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
-        // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens;
+	// If `add_bos` is true, we only add BOS, when json_prompt is a string,
+	// or the first element of the json_prompt array is a string.
+	std::vector<llama_token> prompt_tokens;
 
-        if (json_prompt.is_array())
-        {
-            bool first = true;
-            for (const auto& p : json_prompt)
-            {
-                if (p.is_string())
-                {
-                    auto s = p.template get<std::string>();
-                    std::vector<llama_token> p;
-                    if (first)
-                    {
-                        p = ::llama_tokenize(ctx, s, add_bos);
-                        first = false;
-                    }
-                    else
-                    {
-                        p = ::llama_tokenize(ctx, s, false);
-                    }
-                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                }
-                else
-                {
-                    if (first)
-                    {
-                        first = false;
-                    }
-                    prompt_tokens.push_back(p.template get<llama_token>());
-                }
-            }
-        }
-        else
-        {
-            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
-        }
+	// to support short term learning from chat context,
+	// first line in array is query, next line
+	// is generated text, next line is next query ...
+	bool user_input = true;
 
-        return prompt_tokens;
+	// don't add sys/prefix/suffix if not a normal tokenize
+	bool add_params = add_bos;
+
+	std::string params_system="",params_input_prefix="",params_input_suffix="";
+	if (json_system != NULL)
+	   if (json_system.is_string())
+	      params_system = json_system.template get<std::string>();
+	if (json_prefix != NULL)
+	   if (json_prefix.is_string())
+	      params_input_prefix = json_prefix.template get<std::string>();
+	if (json_suffix != NULL)
+	   if (json_suffix.is_string())
+	      params_input_suffix = json_suffix.template get<std::string>();
+  
+	if (add_params && (params_system.size() > 1)) {
+	   // add the system prompt before the conversation input
+	   LOG("system: '%s'\n", params_system.c_str());
+	   std::vector<llama_token> system;
+	   system = ::llama_tokenize(ctx,params_system,false,true);
+	   prompt_tokens.insert(prompt_tokens.end(),system.begin(), system.end());
+	   LOG("prompt: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, system).c_str());
+	   }
+ 
+	if (json_prompt.is_array())
+	{
+	    for (const auto& p : json_prompt)
+	    {
+		if (p.is_string())
+		{
+		    auto s = p.template get<std::string>();
+		    std::vector<llama_token> p;
+
+		    if (add_params && user_input && (params_input_prefix.size() > 1)) {
+			LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+			std::vector<llama_token> line_pfx;
+			line_pfx = ::llama_tokenize(ctx,params_input_prefix,add_bos,true);
+			prompt_tokens.insert(prompt_tokens.end(),line_pfx.begin(), line_pfx.end());
+			LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+ 
+			// bos has been added
+			add_bos = false;
+			}
+
+		    p = ::llama_tokenize(ctx, s, add_bos, special);
+		    // bos has been added
+		    add_bos = false;
+ 
+		    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
+		    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+ 
+		    if (add_params && user_input && (params_input_suffix.size() > 1)) {
+			LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+			std::vector<llama_token> line_sfx;
+			line_sfx = ::llama_tokenize(ctx,params_input_suffix,false,true);
+			LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+			prompt_tokens.insert(prompt_tokens.end(),line_sfx.begin(), line_sfx.end());
+			}
+ 
+		    user_input = !user_input;
+		}
+		else
+		{
+		    prompt_tokens.push_back(p.template get<llama_token>());
+		}
+	    }
+	}
+	else
+	{
+	    auto s = json_prompt.template get<std::string>();
+
+	    std::vector<llama_token> p;
+ 
+	    if (add_params && (params_input_prefix.size() > 1)) {
+	       LOG("input prefix: '%s'\n", params_input_prefix.c_str());
+	       std::vector<llama_token> line_pfx;
+	       line_pfx = ::llama_tokenize(ctx,params_input_prefix,add_bos,true);
+	       prompt_tokens.insert(prompt_tokens.end(),line_pfx.begin(), line_pfx.end());
+	       LOG("prefix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_pfx).c_str());
+	       // bos has been added
+	       add_bos = false;
+	       }
+ 
+	    p = ::llama_tokenize(ctx, s, add_bos, special);
+	    add_bos = false;
+ 
+	    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, p).c_str());
+	    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+ 
+	    // Add the suffix if defined
+	    if (add_params && (params_input_suffix.size() > 1)) {
+	       LOG("input suffix: '%s'\n", params_input_suffix.c_str());
+	       std::vector<llama_token> line_sfx;
+	       line_sfx = ::llama_tokenize(ctx,params_input_suffix,false,true);
+	       LOG("suffix tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_sfx).c_str());
+	       prompt_tokens.insert(prompt_tokens.end(),line_sfx.begin(), line_sfx.end());
+	       }
+	}
+
+	return prompt_tokens;
     }
 
     llama_client_slot* get_slot(int id) {
-        int64_t t_last = ggml_time_us();
-        llama_client_slot *last_used = nullptr;
+	int64_t t_last = ggml_time_us();
+	llama_client_slot *last_used = nullptr;
 
-        for (llama_client_slot & slot : slots)
-        {
-            if (slot.id == id && slot.available())
-            {
-                return &slot;
-            }
+	for (llama_client_slot & slot : slots)
+	{
+	    if (slot.id == id && slot.available())
+	    {
+		return &slot;
+	    }
 
-            if (slot.available() && slot.t_last_used < t_last)
-            {
-                last_used = &slot;
-                t_last = slot.t_last_used;
-            }
-        }
+	    if (slot.available() && slot.t_last_used < t_last)
+	    {
+		last_used = &slot;
+		t_last = slot.t_last_used;
+	    }
+	}
 
-        return last_used;
+	return last_used;
     }
 
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
-        slot_params default_params;
-        llama_sampling_params default_sparams;
+	slot_params default_params;
+	llama_sampling_params default_sparams;
 
-        slot->params.stream           = json_value(data, "stream",            false);
-        slot->params.cache_prompt     = json_value(data, "cache_prompt",      false);
-        slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
-        slot->sparams.top_k           = json_value(data, "top_k",             default_sparams.top_k);
-        slot->sparams.top_p           = json_value(data, "top_p",             default_sparams.top_p);
-        slot->sparams.tfs_z           = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typical_p       = json_value(data, "typical_p",         default_sparams.typical_p);
-        slot->sparams.temp            = json_value(data, "temperature",       default_sparams.temp);
-        slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
-        slot->sparams.penalty_repeat  = json_value(data, "repeat_penalty",    default_sparams.penalty_repeat);
-        slot->sparams.penalty_freq    = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
-        slot->sparams.penalty_present = json_value(data, "presence_penalty",  default_sparams.penalty_present);
-        slot->sparams.mirostat        = json_value(data, "mirostat",          default_sparams.mirostat);
-        slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
-        slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
-        slot->sparams.penalize_nl     = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
-        slot->params.n_keep           = json_value(data, "n_keep",            slot->params.n_keep);
-        slot->params.seed             = json_value(data, "seed",              default_params.seed);
-        slot->sparams.grammar         = json_value(data, "grammar",           default_sparams.grammar);
-        slot->sparams.n_probs         = json_value(data, "n_probs",           default_sparams.n_probs);
+	slot->params.stream           = json_value(data, "stream",            false);
+	slot->params.cache_prompt     = json_value(data, "cache_prompt",      false);
+	slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
+	slot->sparams.top_k           = json_value(data, "top_k",             default_sparams.top_k);
+	slot->sparams.top_p           = json_value(data, "top_p",             default_sparams.top_p);
+	slot->sparams.tfs_z           = json_value(data, "tfs_z",             default_sparams.tfs_z);
+	slot->sparams.typical_p       = json_value(data, "typical_p",         default_sparams.typical_p);
+	slot->sparams.temp            = json_value(data, "temperature",       default_sparams.temp);
+	slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
+	slot->sparams.penalty_repeat  = json_value(data, "repeat_penalty",    default_sparams.penalty_repeat);
+	slot->sparams.penalty_freq    = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+	slot->sparams.penalty_present = json_value(data, "presence_penalty",  default_sparams.penalty_present);
+	slot->sparams.mirostat        = json_value(data, "mirostat",          default_sparams.mirostat);
+	slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
+	slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
+	slot->sparams.penalize_nl     = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
+	slot->params.n_keep           = json_value(data, "n_keep",            slot->params.n_keep);
+	slot->params.seed             = json_value(data, "seed",              default_params.seed);
+	slot->sparams.grammar         = json_value(data, "grammar",           default_sparams.grammar);
+	slot->sparams.n_probs         = json_value(data, "n_probs",           default_sparams.n_probs);
 
-        // infill
-        if (data.count("input_prefix") != 0)
-        {
-            slot->params.input_prefix = data["input_prefix"];
-        }
-        else
-        {
-            slot->params.input_prefix = "";
-        }
+	// system prompt
+	if (data.count("system") != 0)
+	{
+	    slot->params.system = data["system"];
+	}
+	else
+	{
+	    slot->params.system = "";
+	}
 
-        if (data.count("input_suffix") != 0)
-        {
-            slot->params.input_suffix = data["input_suffix"];
-        }
-        else
-        {
-            slot->params.input_suffix = "";
-        }
+	// infill, prompt prefix/suffix
+	if (data.count("input_prefix") != 0)
+	{
+	    slot->params.input_prefix = data["input_prefix"];
+	}
+	else
+	{
+	    slot->params.input_prefix = "";
+	}
 
-        if (data.count("prompt") != 0)
-        {
-            slot->prompt = data["prompt"];
-        }
-        else
-        {
-            slot->prompt = "";
-        }
+	if (data.count("input_suffix") != 0)
+	{
+	    slot->params.input_suffix = data["input_suffix"];
+	}
+	else
+	{
+	    slot->params.input_suffix = "";
+	}
 
-        slot->sparams.logit_bias.clear();
+	if (data.count("prompt") != 0)
+	{
+	    slot->prompt = data["prompt"];
+	}
+	else
+	{
+	    slot->prompt = "";
+	}
 
-        if (json_value(data, "ignore_eos", false))
-        {
-            slot->sparams.logit_bias[llama_token_eos(ctx)] = -INFINITY;
-        }
+	slot->sparams.logit_bias.clear();
 
-        const auto &logit_bias = data.find("logit_bias");
-        if (logit_bias != data.end() && logit_bias->is_array())
-        {
-            const int n_vocab = llama_n_vocab(model);
-            for (const auto &el : *logit_bias)
-            {
-                if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
-                {
-                    llama_token tok = el[0].get<llama_token>();
-                    if (tok >= 0 && tok < n_vocab)
-                    {
-                        if (el[1].is_number())
-                        {
-                            slot->sparams.logit_bias[tok] = el[1].get<float>();
-                        }
-                        else if (el[1].is_boolean() && !el[1].get<bool>())
-                        {
-                            slot->sparams.logit_bias[tok] = -INFINITY;
-                        }
-                    }
-                }
-            }
-        }
+	if (json_value(data, "ignore_eos", false))
+	{
+	    slot->sparams.logit_bias[llama_token_eos(ctx)] = -INFINITY;
+	}
 
-        slot->params.antiprompt.clear();
-        const auto &stop = data.find("stop");
-        if (stop != data.end() && stop->is_array())
-        {
-            for (const auto &word : *stop)
-            {
-                if (!word.empty())
-                {
-                    slot->params.antiprompt.push_back(word);
-                }
-            }
-        }
+	const auto &logit_bias = data.find("logit_bias");
+	if (logit_bias != data.end() && logit_bias->is_array())
+	{
+	    const int n_vocab = llama_n_vocab(model);
+	    for (const auto &el : *logit_bias)
+	    {
+		if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+		{
+		    llama_token tok = el[0].get<llama_token>();
+		    if (tok >= 0 && tok < n_vocab)
+		    {
+			if (el[1].is_number())
+			{
+			    slot->sparams.logit_bias[tok] = el[1].get<float>();
+			}
+			else if (el[1].is_boolean() && !el[1].get<bool>())
+			{
+			    slot->sparams.logit_bias[tok] = -INFINITY;
+			}
+		    }
+		}
+	    }
+	}
 
-        if (multimodal)
-        {
-            const auto &images_data = data.find("image_data");
-            if (images_data != data.end() && images_data->is_array())
-            {
-                for (const auto &img : *images_data)
-                {
-                    std::string data_b64 = img["data"].get<std::string>();
-                    slot_image img_sl;
-                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    int width, height, channels;
-                    std::vector<uint8_t> image_buffer = base64_decode(data_b64);
-                    data_b64.clear();
-                    auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
-                    if (!data) {
-                        LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
-                        return false;
-                    }
-                    LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
-                    img_sl.img_data.nx = width;
-                    img_sl.img_data.ny = height;
-                    img_sl.img_data.size = width * height * 3;
-                    img_sl.img_data.data = new uint8_t[width * height * 3]();
-                    memcpy(img_sl.img_data.data, data, width * height * 3);
-                    stbi_image_free(data);
-                    img_sl.request_encode_image = true;
-                    slot->images.push_back(img_sl);
-                }
-                // process prompt
-                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
-                if (slot->images.size() > 0 && !slot->prompt.is_array())
-                {
-                    std::string prompt = slot->prompt.get<std::string>();
-                    size_t pos = 0, begin_prefix = 0;
-                    std::string pattern = "[img-";
-                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
-                        size_t end_prefix = pos;
-                        pos += pattern.length();
-                        size_t end_pos = prompt.find("]", pos);
-                        if (end_pos != std::string::npos)
-                        {
-                            std::string image_id = prompt.substr(pos, end_pos - pos);
-                            try
-                            {
-                                int img_id = std::stoi(image_id);
-                                bool found = false;
-                                for (slot_image &img : slot->images)
-                                {
-                                    if (img.id == img_id) {
-                                        found = true;
-                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
-                                        begin_prefix = end_pos + 1;
-                                        break;
-                                    }
-                                }
-                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
-                                    slot->images.clear();
-                                    return false;
-                                }
-                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
-                                slot->images.clear();
-                                return false;
-                            }
-                        }
-                    }
-                    slot->prompt = "";
-                    slot->params.input_suffix = prompt.substr(begin_prefix);
-                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
-                }
-            }
-        }
+	slot->params.antiprompt.clear();
+	const auto &stop = data.find("stop");
+	if (stop != data.end() && stop->is_array())
+	{
+	    for (const auto &word : *stop)
+	    {
+		if (!word.empty())
+		{
+		    slot->params.antiprompt.push_back(word);
+		}
+	    }
+	}
 
-        if (slot->ctx_sampling != nullptr)
-        {
-            llama_sampling_free(slot->ctx_sampling);
-        }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
-        slot->command = LOAD_PROMPT;
+	if (multimodal)
+	{
+	    const auto &images_data = data.find("image_data");
+	    if (images_data != data.end() && images_data->is_array())
+	    {
+		for (const auto &img : *images_data)
+		{
+		    std::string data_b64 = img["data"].get<std::string>();
+		    slot_image img_sl;
+		    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
+		    int width, height, channels;
+		    std::vector<uint8_t> image_buffer = base64_decode(data_b64);
+		    data_b64.clear();
+		    auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
+		    if (!data) {
+			LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+			return false;
+		    }
+		    LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
+		    img_sl.img_data.nx = width;
+		    img_sl.img_data.ny = height;
+		    img_sl.img_data.size = width * height * 3;
+		    img_sl.img_data.data = new uint8_t[width * height * 3]();
+		    memcpy(img_sl.img_data.data, data, width * height * 3);
+		    stbi_image_free(data);
+		    img_sl.request_encode_image = true;
+		    slot->images.push_back(img_sl);
+		}
+		// process prompt
+		// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
+		if (slot->images.size() > 0 && !slot->prompt.is_array())
+		{
+		    std::string prompt = slot->prompt.get<std::string>();
+		    size_t pos = 0, begin_prefix = 0;
+		    std::string pattern = "[img-";
+		    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
+			size_t end_prefix = pos;
+			pos += pattern.length();
+			size_t end_pos = prompt.find("]", pos);
+			if (end_pos != std::string::npos)
+			{
+			    std::string image_id = prompt.substr(pos, end_pos - pos);
+			    try
+			    {
+				int img_id = std::stoi(image_id);
+				bool found = false;
+				for (slot_image &img : slot->images)
+				{
+				    if (img.id == img_id) {
+					found = true;
+					img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
+					begin_prefix = end_pos + 1;
+					break;
+				    }
+				}
+				if (!found) {
+				    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+				    slot->images.clear();
+				    return false;
+				}
+			    } catch (const std::invalid_argument& e) {
+				LOG_TEE("Invalid image number id in prompt\n");
+				slot->images.clear();
+				return false;
+			    }
+			}
+		    }
+		    slot->prompt = "";
+		    slot->params.input_suffix = prompt.substr(begin_prefix);
+		    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
+		}
+	    }
+	}
 
-        all_slots_are_idle = false;
+	if (slot->ctx_sampling != nullptr)
+	{
+	    llama_sampling_free(slot->ctx_sampling);
+	}
+	slot->ctx_sampling = llama_sampling_init(slot->sparams);
+	slot->command = LOAD_PROMPT;
 
-        LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+	all_slots_are_idle = false;
 
-        return true;
+	LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+
+	return true;
     }
 
     void kv_cache_clear() {
-        // clear the entire KV cache
-        llama_kv_cache_tokens_rm(ctx, -1, -1);
-        clean_kv_cache = false;
+	// clear the entire KV cache (the -1/-1 bounds presumably mean "no limit" - confirm against the llama.h API)
+	llama_kv_cache_tokens_rm(ctx, -1, -1);
+	clean_kv_cache = false; // reset the flag now that the cache has been cleared
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+	system_tokens = ::llama_tokenize(ctx, system_prompt, true);
 
-        llama_batch_clear(batch);
+	llama_batch_clear(batch);
 
-        kv_cache_clear();
+	kv_cache_clear();
 
-        for (int32_t i = 0; i < batch.n_tokens; ++i)
-        {
-            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-        }
+	for (int32_t i = 0; i < batch.n_tokens; ++i)
+	{
+	    llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+	}
 
-        if (llama_decode(ctx, batch) != 0)
-        {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return;
-        }
+	if (llama_decode(ctx, batch) != 0)
+	{
+	    LOG_TEE("%s: llama_decode() failed\n", __func__);
+	    return;
+	}
 
-        // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
-        }
+	// assign the system KV cache to all parallel sequences
+	for (int32_t i = 1; i < params.n_parallel; ++i)
+	{
+	    llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+	}
 
-        LOG_TEE("system prompt updated\n");
-        system_need_update = false;
+	LOG_TEE("system prompt updated\n");
+	system_need_update = false;
     }
 
     void notify_system_prompt_changed() {
-        // release all slots
-        for (llama_client_slot &slot : slots)
-        {
-            slot.release();
-        }
-        wait_all_are_idle();
-        all_slots_are_idle = true;
+	// release all slots, wait for them to become available, then block until system_need_update is cleared again
+	for (llama_client_slot &slot : slots)
+	{
+	    slot.release();
+	}
+	wait_all_are_idle(); // returns once every slot reports available()
+	all_slots_are_idle = true;
 
-        // wait until system prompt load
-        system_need_update = true;
-        while (system_need_update)
-        {
-            std::this_thread::sleep_for(std::chrono::milliseconds(5));
-        }
-        // system prompt loaded, continue
+	// wait until system prompt load
+	system_need_update = true; // request a reload; update_system_prompt() clears the flag on success (presumably from another thread - confirm)
+	while (system_need_update)
+	{
+	    std::this_thread::sleep_for(std::chrono::milliseconds(5)); // poll the flag every 5 ms
+	}
+	// system prompt loaded, continue
     }
 
     void process_system_prompt_data(const json &sys_props) {
-        system_prompt  = sys_props.value("prompt", "");
-        name_user      = sys_props.value("anti_prompt", "");
-        name_assistant = sys_props.value("assistant_name", "");
+	system_prompt  = sys_props.value("prompt", ""); // store the new system prompt settings; each key falls back to "" when absent
+	name_user      = sys_props.value("anti_prompt", "");
+	name_assistant = sys_props.value("assistant_name", "");
 
-        if (slots.size() > 0)
-        {
-            notify_system_prompt_changed();
-        }
-        else
-        {
-            system_need_update = true;
-        }
+	if (slots.size() > 0) // slots already exist: release them and wait for the reload
+	{
+	    notify_system_prompt_changed();
+	}
+	else
+	{
+	    system_need_update = true; // no slots yet: only flag that a reload is needed
+	}
     }
 
     void wait_all_are_idle() {
-        bool wait = true;
-        while (wait)
-        {
-            wait = false;
-            for (auto &slot : slots)
-            {
-                if (!slot.available())
-                {
-                    wait = true;
-                    break;
-                }
-            }
-        }
+	bool wait = true; // loops (without sleeping) until a full pass finds every slot available()
+	while (wait)
+	{
+	    wait = false;
+	    for (auto &slot : slots)
+	    {
+		if (!slot.available())
+		{
+		    wait = true; // at least one slot is still busy: rescan from the start
+		    break;
+		}
+	    }
+	}
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
-                                        const stop_type type, llama_client_slot &slot)
+					const stop_type type, llama_client_slot &slot)
     {
-        size_t stop_pos = std::string::npos;
+	size_t stop_pos = std::string::npos; // result: smallest match position over all of slot.params.antiprompt, npos when nothing matches
 
-        for (const std::string &word : slot.params.antiprompt)
-        {
-            size_t pos;
-            if (type == STOP_FULL)
-            {
-                const size_t tmp = word.size() + last_token_size;
-                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
-                pos = text.find(word, from_pos);
-            }
-            else
-            {
-                pos = find_partial_stop_string(word, text);
-            }
-            if (pos != std::string::npos &&
-                (stop_pos == std::string::npos || pos < stop_pos))
-            {
-                if (type == STOP_FULL)
-                {
-                    slot.stopped_word = true;
-                    slot.stopping_word = word;
-                    slot.has_next_token = false;
-                }
-                stop_pos = pos;
+	for (const std::string &word : slot.params.antiprompt)
+	{
+	    size_t pos;
+	    if (type == STOP_FULL)
+	    {
+		const size_t tmp = word.size() + last_token_size; // only the tail of text (stop word length + last token length) is searched
+		const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; // clamp to 0 when text is shorter than that window
+		pos = text.find(word, from_pos);
+	    }
+	    else
+	    {
+		pos = find_partial_stop_string(word, text); // presumably matches a stop word that is only partly present at the end of text - confirm
+	    }
+	    if (pos != std::string::npos &&
+		(stop_pos == std::string::npos || pos < stop_pos))
+	    {
+		if (type == STOP_FULL) // only a full match changes the slot's state
+		{
+		    slot.stopped_word = true;
+		    slot.stopping_word = word;
+		    slot.has_next_token = false;
+		}
+		stop_pos = pos; // keep the earliest match seen so far
 
-            }
-        }
+	    }
+	}
 
-        return stop_pos;
+	return stop_pos;
     }
 
     bool process_token(completion_token_output &result, llama_client_slot &slot) {
-        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
-        slot.sampled = result.tok;
+	// handles one sampled token: appends its text, applies stop-word/limit/EOS checks, returns slot.has_next_token
+	const std::string token_str = llama_token_to_piece(ctx, result.tok);
+	slot.sampled = result.tok; // remember which token was sampled - used for repetition penalties during sampling
 
-        // search stop word and delete it
-        slot.generated_text += token_str;
-        slot.has_next_token = true;
+	// search stop word and delete it
+	slot.generated_text += token_str;
+	slot.has_next_token = true; // default: keep generating unless a check below says otherwise
 
-        if (slot.multibyte_pending > 0)
-        {
-            slot.multibyte_pending -= token_str.size();
-        }
-        else if (token_str.size() == 1)
-        {
-            const char c = token_str[0];
-            // 2-byte characters: 110xxxxx 10xxxxxx
-            if ((c & 0xE0) == 0xC0)
-            {
-                slot.multibyte_pending = 1;
-                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
-            }
-            else if ((c & 0xF0) == 0xE0)
-            {
-                slot.multibyte_pending = 2;
-                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-            }
-            else if ((c & 0xF8) == 0xF0)
-            {
-                slot.multibyte_pending = 3;
-            }
-            else
-            {
-                slot.multibyte_pending = 0;
-            }
-        }
+	if (slot.multibyte_pending > 0)
+	{
+	    slot.multibyte_pending -= token_str.size(); // this token supplies bytes of the multi-byte character in progress
+	}
+	else if (token_str.size() == 1) // a one-byte token may be the lead byte of a multi-byte UTF-8 character
+	{
+	    const char c = token_str[0];
+	    // 2-byte characters: 110xxxxx 10xxxxxx
+	    if ((c & 0xE0) == 0xC0)
+	    {
+		slot.multibyte_pending = 1;
+		// (describes the next branch) 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+	    }
+	    else if ((c & 0xF0) == 0xE0)
+	    {
+		slot.multibyte_pending = 2;
+		// (describes the next branch) 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	    }
+	    else if ((c & 0xF8) == 0xF0)
+	    {
+		slot.multibyte_pending = 3;
+	    }
+	    else
+	    {
+		slot.multibyte_pending = 0;
+	    }
+	}
 
-        if (slot.multibyte_pending == 0)
-        {
-            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.sent_count, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
+	if (slot.multibyte_pending == 0) // text is only checked and sent when no partial multi-byte character is pending
+	{
+	    size_t pos = std::min(slot.sent_count, slot.generated_text.size()); // start of the text that has not been sent yet
+	    const std::string str_test = slot.generated_text.substr(pos);
+	    bool is_stop_full = false;
+	    size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+	    if (stop_pos != std::string::npos)
+	    {
+		is_stop_full = true;
+		slot.generated_text.erase( // cut the stop word and everything after it
+		    slot.generated_text.begin() + pos + stop_pos,
+		    slot.generated_text.end());
+		pos = std::min(slot.sent_count, slot.generated_text.size());
+	    }
+	    else
+	    {
+		is_stop_full = false;
+		stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot); // no full match: look for a partial one instead
+	    }
 
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.sent_count += result.text_to_send.size();
-                // add the token to slot queue and cache
-            }
-            slot.add_token_string(result);
-            if (slot.params.stream)
-            {
-                send_partial_response(slot, result);
-            }
-        }
+	    // check if there is any token to predict
+	    if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+	    {
+		// do not send the stop word in the response
+		result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+		slot.sent_count += result.text_to_send.size();
+		// add the token to slot queue and cache
+	    }
+	    slot.add_token_string(result);
+	    if (slot.params.stream)
+	    {
+		send_partial_response(slot, result);
+	    }
+	}
 
-        if (slot.multibyte_pending > 0 && !slot.has_next_token)
-        {
-            slot.has_next_token = true;
-        }
+	if (slot.multibyte_pending > 0 && !slot.has_next_token) // do not stop while a multi-byte character is incomplete
+	{
+	    slot.has_next_token = true;
+	}
 
-        // check the limits
-        if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params))
-        {
-            slot.stopped_limit = true;
-            slot.has_next_token = false;
-        }
+	// check the limits
+	if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params))
+	{
+	    slot.stopped_limit = true;
+	    slot.has_next_token = false;
+	}
 
-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx))
-        {
-            slot.stopped_eos = true;
-            slot.has_next_token = false;
-            LOG_VERBOSE("eos token found", {});
-        }
+	if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx)) // end-of-sequence token ends generation
+	{
+	    slot.stopped_eos = true;
+	    slot.has_next_token = false;
+	    LOG_VERBOSE("eos token found", {});
+	}
 
-        LOG_VERBOSE("next token", {
-                                      {"token", result.tok},
-                                      {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
-                                      {"has_next_token", slot.has_next_token},
-                                      {"n_remain", slot.n_remaining},
-                                      {"num_tokens_predicted", slot.n_decoded},
-                                      {"stopped_eos", slot.stopped_eos},
-                                      {"stopped_word", slot.stopped_word},
-                                      {"stopped_limit", slot.stopped_limit},
-                                      {"stopping_word", slot.stopping_word},
-                                  });
+	LOG_VERBOSE("next token", {
+				      {"token", result.tok},
+				      {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
+				      {"has_next_token", slot.has_next_token},
+				      {"n_remain", slot.n_remaining},
+				      {"num_tokens_predicted", slot.n_decoded},
+				      {"stopped_eos", slot.stopped_eos},
+				      {"stopped_word", slot.stopped_word},
+				      {"stopped_limit", slot.stopped_limit},
+				      {"stopping_word", slot.stopping_word},
+				  });
 
-        return slot.has_next_token; // continue
+	return slot.has_next_token; // true = continue generating for this slot
     }
 
     bool process_images(llama_client_slot &slot) const
     {
-        for (slot_image &img : slot.images)
-        {
-            if (!img.request_encode_image)
-            {
-                continue;
-            }
-            clip_image_f32 img_res;
-            if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
-            {
-                LOG_TEE("Error processing the given image");
-                clip_free(clp_ctx);
-                return false;
-            }
-            img.image_tokens = clip_n_patches(clp_ctx);
-            img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
-            if (!img.image_embedding)
-            {
-                LOG_TEE("Unable to allocate memory for image embeddings\n");
-                clip_free(clp_ctx);
-                return false;
-            }
-            LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-            if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
-            {
-                LOG_TEE("Unable to encode image\n");
-                return false;
-            }
-            img.request_encode_image = false;
-        }
+	for (slot_image &img : slot.images) // preprocess and encode every image still flagged request_encode_image; returns false on the first failure
+	{
+	    if (!img.request_encode_image)
+	    {
+		continue; // this image does not need encoding
+	    }
+	    clip_image_f32 img_res;
+	    if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
+	    {
+		LOG_TEE("Error processing the given image");
+		clip_free(clp_ctx); // NOTE(review): clp_ctx is passed to clip_* calls again on later invocations - confirm it is safe to free here
+		return false;
+	    }
+	    img.image_tokens = clip_n_patches(clp_ctx);
+	    img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); // buffer is not released in this function
+	    if (!img.image_embedding)
+	    {
+		LOG_TEE("Unable to allocate memory for image embeddings\n");
+		clip_free(clp_ctx); // NOTE(review): same concern as above
+		return false;
+	    }
+	    LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
+	    if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
+	    {
+		LOG_TEE("Unable to encode image\n");
+		return false; // NOTE(review): img.image_embedding stays allocated on this path - confirm who frees it
+	    }
+	    img.request_encode_image = false; // encoded: skip this image on the next call
+	}
 
-        return slot.images.size() > 0;
+	return slot.images.size() > 0; // true only when the slot holds at least one image
     }
 
     void send_error(int id, std::string error)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
-        task_result res;
-        res.id = id;
-        res.error = true;
-        res.result_json = { { "content", error } };
-        queue_results.push_back(res);
+	std::lock_guard<std::mutex> lock(mutex_results); // queues an error result for task `id`; mutex_results is held until return
+	task_result res;
+	res.id = id;
+	res.error = true;
+	res.result_json = { { "content", error } }; // the error text is carried in the "content" field
+	queue_results.push_back(res);
     }
 
     json get_model_props()
     {
-        return get_formated_generation(slots[0]);
+	return get_formated_generation(slots[0]); // reports the settings of slot 0; indexes slots without a bounds check, so at least one slot must exist
     }
 
     json get_formated_generation(llama_client_slot &slot)
     {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx));
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
-        return json {
-            {"n_ctx",             slot.n_ctx},
-            {"model",             params.model_alias},
-            {"seed",              slot.params.seed},
-            {"temp",              slot.sparams.temp},
-            {"top_k",             slot.sparams.top_k},
-            {"top_p",             slot.sparams.top_p},
-            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typical_p},
-            {"repeat_last_n",     slot.sparams.penalty_last_n},
-            {"repeat_penalty",    slot.sparams.penalty_repeat},
-            {"presence_penalty",  slot.sparams.penalty_present},
-            {"frequency_penalty", slot.sparams.penalty_freq},
-            {"mirostat",          slot.sparams.mirostat},
-            {"mirostat_tau",      slot.sparams.mirostat_tau},
-            {"mirostat_eta",      slot.sparams.mirostat_eta},
-            {"penalize_nl",       slot.sparams.penalize_nl},
-            {"stop",              slot.params.antiprompt},
-            {"n_predict",         slot.params.n_predict},
-            {"n_keep",            params.n_keep},
-            {"ignore_eos",        ignore_eos},
-            {"stream",            slot.params.stream},
-            {"logit_bias",        slot.sparams.logit_bias},
-            {"n_probs",           slot.sparams.n_probs},
-            {"grammar",           slot.sparams.grammar},
-        };
+	const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx)); // builds a JSON snapshot of the slot's generation settings
+	const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && // true when the EOS token carries a -infinity logit bias
+				eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+	return json {
+	    {"n_ctx",             slot.n_ctx},
+	    {"model",             params.model_alias},
+	    {"seed",              slot.params.seed},
+	    {"temp",              slot.sparams.temp},
+	    {"top_k",             slot.sparams.top_k},
+	    {"top_p",             slot.sparams.top_p},
+	    {"tfs_z",             slot.sparams.tfs_z},
+	    {"typical_p",         slot.sparams.typical_p},
+	    {"repeat_last_n",     slot.sparams.penalty_last_n},
+	    {"repeat_penalty",    slot.sparams.penalty_repeat},
+	    {"presence_penalty",  slot.sparams.penalty_present},
+	    {"frequency_penalty", slot.sparams.penalty_freq},
+	    {"mirostat",          slot.sparams.mirostat},
+	    {"mirostat_tau",      slot.sparams.mirostat_tau},
+	    {"mirostat_eta",      slot.sparams.mirostat_eta},
+	    {"penalize_nl",       slot.sparams.penalize_nl},
+	    {"system",            slot.params.system},
+	    {"input_prefix",      slot.params.input_prefix},
+	    {"input_suffix",      slot.params.input_suffix},
+	    {"stop",              slot.params.antiprompt},
+	    {"n_predict",         slot.params.n_predict},
+	    {"n_keep",            params.n_keep}, // NOTE(review): read from params, not slot.params like the neighbouring fields - confirm this is intended
+	    {"ignore_eos",        ignore_eos},
+	    {"stream",            slot.params.stream},
+	    {"logit_bias",        slot.sparams.logit_bias},
+	    {"n_probs",           slot.sparams.n_probs},
+	    {"grammar",           slot.sparams.grammar},
+	};
     }
 
     void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
-        task_result res;
-        res.id = slot.task_id;
-        res.error = false;
-        res.stop = false;
+	std::lock_guard<std::mutex> lock(mutex_results); // queues a non-final result carrying tkn.text_to_send; mutex_results is held until return
+	task_result res;
+	res.id = slot.task_id;
+	res.error = false;
+	res.stop = false;
 
-        res.result_json = json
-        {
-            {"content",    tkn.text_to_send},
-            {"stop",       false},
-            {"slot_id",    slot.id},
-            {"multimodal", multimodal}
-        };
+	res.result_json = json
+	{
+	    {"content",    tkn.text_to_send},
+	    {"stop",       false},
+	    {"slot_id",    slot.id},
+	    {"multimodal", multimodal}
+	};
 
-        if (slot.sparams.n_probs > 0)
-        {
-            std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
-            size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
-            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
-            if (probs_pos < probs_stop_pos)
-            {
-                probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
-            }
-            slot.sent_token_probs_index = probs_stop_pos;
-            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
-        }
+	if (slot.sparams.n_probs > 0) // probabilities requested: attach the entries not sent yet
+	{
+	    std::vector<completion_token_output> probs_output = {};
+	    const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); // re-tokenized only to count how many entries this text covers
+	    size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); // both bounds are clamped to the number of stored entries
+	    size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
+	    if (probs_pos < probs_stop_pos)
+	    {
+		probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
+	    }
+	    slot.sent_token_probs_index = probs_stop_pos; // the next call continues from here
+	    res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
+	}
 
-        queue_results.push_back(res);
+	queue_results.push_back(res);
     }
 
     void send_final_response(llama_client_slot &slot)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
-        task_result res;
-        res.id = slot.task_id;
-        res.error = false;
-        res.stop = true;
+	std::lock_guard<std::mutex> lock(mutex_results); // Queue the final result for slot's task; the lock is held until return and covers queue_results
+	task_result res;
+	res.id = slot.task_id; // next_result() matches queued results against this id
+	res.error = false;
+	res.stop = true; // marks this as the last result for the task
 
-        res.result_json = json
-        {
-            {"content",             !slot.params.stream ? slot.generated_text : ""},
-            {"slot_id",             slot.id},
-            {"stop",                true},
-            {"model",               params.model_alias},
-            {"tokens_predicted",    slot.n_decoded},
-            {"tokens_evaluated",    slot.num_prompt_tokens},
-            {"generation_settings", get_formated_generation(slot)},
-            {"prompt",              slot.prompt},
-            {"truncated",           slot.truncated},
-            {"stopped_eos",         slot.stopped_eos},
-            {"stopped_word",        slot.stopped_word},
-            {"stopped_limit",       slot.stopped_limit},
-            {"stopping_word",       slot.stopping_word},
-            {"tokens_cached",       slot.n_past},
-            {"timings",             slot.get_formated_timings()}
-        };
+	res.result_json = json // summary payload: text, counters, stop reason flags, settings and timings
+	{
+	    {"content",             !slot.params.stream ? slot.generated_text : ""}, // full text only when not streaming; empty string when streaming
+	    {"slot_id",             slot.id},
+	    {"stop",                true},
+	    {"model",               params.model_alias},
+	    {"tokens_predicted",    slot.n_decoded},
+	    {"tokens_evaluated",    slot.num_prompt_tokens},
+	    {"generation_settings", get_formated_generation(slot)},
+	    {"prompt",              slot.prompt},
+	    {"truncated",           slot.truncated},
+	    {"stopped_eos",         slot.stopped_eos},
+	    {"stopped_word",        slot.stopped_word},
+	    {"stopped_limit",       slot.stopped_limit},
+	    {"stopping_word",       slot.stopping_word},
+	    {"tokens_cached",       slot.n_past},
+	    {"timings",             slot.get_formated_timings()}
+	};
 
-        if (slot.sparams.n_probs > 0)
-        {
-            std::vector<completion_token_output> probs = {};
-            if (!slot.params.stream && slot.stopped_word)
-            {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
-                probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
-            }
-            else
-            {
-                probs = std::vector<completion_token_output>(
-                                    slot.generated_token_probs.begin(),
-                                    slot.generated_token_probs.begin() + slot.sent_token_probs_index);
-            }
-            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
-        }
+	if (slot.sparams.n_probs > 0) // probabilities are attached only when n_probs is positive
+	{
+	    std::vector<completion_token_output> probs = {};
+	    if (!slot.params.stream && slot.stopped_word) // non-streaming run that ended on a stop word: drop the stop word's trailing entries
+	    {
+		const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); // only the token count of the stop word is used
+		probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - std::min(stop_word_toks.size(), slot.generated_token_probs.size())); // clamp the trim so the end iterator can never move before begin()
+	    }
+	    else // otherwise report the entries up to sent_token_probs_index
+	    {
+		probs = std::vector<completion_token_output>(
+				    slot.generated_token_probs.begin(),
+				    slot.generated_token_probs.begin() + slot.sent_token_probs_index);
+	    }
+	    res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
+	}
 
-        queue_results.push_back(res);
+	queue_results.push_back(res); // publish the final result; next_result() removes it from the queue
     }
 
     void send_embedding(llama_client_slot &slot)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
-        task_result res;
-        res.id = slot.task_id;
-        res.error = false;
-        res.stop = true;
+	std::lock_guard<std::mutex> lock(mutex_results); // Queue an embedding result for slot's task; the lock is held until return and covers queue_results
+	task_result res;
+	res.id = slot.task_id; // next_result() matches queued results against this id
+	res.error = false;
+	res.stop = true; // an embedding is delivered as a single, final result
 
-        const int n_embd = llama_n_embd(model);
-        if (!params.embedding)
-        {
-            LOG_WARNING("embedding disabled", {
-                                                  {"params.embedding", params.embedding},
-                                              });
-            res.result_json = json
-            {
-                {"embedding", std::vector<float>(n_embd, 0.0f)},
-            };
-        }
-        else
-        {
-            const float *data = llama_get_embeddings(ctx);
-            std::vector<float> embedding(data, data + n_embd);
-            res.result_json = json
-            {
-                {"embedding", embedding },
-            };
-        }
-        queue_results.push_back(res);
+	const int n_embd = llama_n_embd(model); // number of floats placed in the "embedding" array below
+	if (!params.embedding) // embeddings are disabled: warn and answer with an all-zero vector instead of failing
+	{
+	    LOG_WARNING("embedding disabled", {
+						  {"params.embedding", params.embedding},
+					      });
+	    res.result_json = json
+	    {
+		{"embedding", std::vector<float>(n_embd, 0.0f)},
+	    };
+	}
+	else // embeddings are enabled: copy n_embd floats out of the buffer returned for ctx
+	{
+	    const float *data = llama_get_embeddings(ctx); // NOTE(review): used without a null check — confirm it cannot be null here
+	    std::vector<float> embedding(data, data + n_embd);
+	    res.result_json = json
+	    {
+		{"embedding", embedding },
+	    };
+	}
+	queue_results.push_back(res); // publish the result; next_result() removes it from the queue
     }
 
     int request_completion(json data, bool infill)
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        task_server task;
-        task.id = id_gen++;
-        task.data = data;
-        task.infill_mode = infill;
-        task.type = COMPLETION_TASK;
-        queue_tasks.push_back(task);
-        return task.id;
+	std::lock_guard<std::mutex> lock(mutex_tasks); // Enqueue a completion task; the lock is held until return and covers queue_tasks and id_gen
+	task_server task;
+	task.id = id_gen++; // take the next task id from the shared counter
+	task.data = data; // request JSON, read later by process_tasks()
+	task.infill_mode = infill; // copied to slot->infill when the task is processed
+	task.type = COMPLETION_TASK;
+	queue_tasks.push_back(task); // consumed by process_tasks()
+	return task.id; // id the caller can pass to next_result() or request_cancel()
     }
 
     task_result next_result(int task_id)
     {
-        while (true)
-        {
-            std::this_thread::sleep_for(std::chrono::microseconds(5));
-            std::lock_guard<std::mutex> lock(mutex_results);
+	while (true) // Poll queue_results until an entry for task_id appears; there is no timeout, so this blocks until one is found
+	{
+	    std::this_thread::sleep_for(std::chrono::microseconds(5)); // short pause before each attempt, taken before the lock is acquired
+	    std::lock_guard<std::mutex> lock(mutex_results); // scoped to one loop iteration, so it is released before the next sleep
 
-            if (queue_results.empty())
-            {
-                continue;
-            }
+	    if (queue_results.empty()) // nothing queued yet: go around again
+	    {
+		continue;
+	    }
 
-            for (int i = 0; i < (int) queue_results.size(); i++)
-            {
-                if (queue_results[i].id == task_id)
-                {
-                    task_result res = queue_results[i];
-                    queue_results.erase(queue_results.begin() + i);
-                    return res;
-                }
-            }
-        }
+	    for (int i = 0; i < (int) queue_results.size(); i++) // linear scan for the first result that belongs to task_id
+	    {
+		if (queue_results[i].id == task_id)
+		{
+		    task_result res = queue_results[i]; // copy the entry out before erasing it
+		    queue_results.erase(queue_results.begin() + i); // each result is handed out exactly once
+		    return res;
+		}
+	    }
+	}
 
-        // never reached
-        //return task_result{-1, false, false, {}};
+	// never reached: the loop above only exits through the return inside it
+	//return task_result{-1, false, false, {}};
     }
 
     // for multiple images processing
     bool ingest_images(llama_client_slot &slot, int n_batch)
     {
-        int image_idx = 0;
+	int image_idx = 0; // Decode each image of slot in turn, with the text queued in batch before it; returns false on the first llama_decode failure
 
-        while (image_idx < (int) slot.images.size())
-        {
-            slot_image &img = slot.images[image_idx];
+	while (image_idx < (int) slot.images.size()) // one iteration per image
+	{
+	    slot_image &img = slot.images[image_idx];
 
-            // process prefix prompt
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-            {
-                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                    0, 0, 0, // unused
-                };
-                if (llama_decode(ctx, batch_view))
-                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
-                    return false;
-                }
-            }
+	    // process prefix prompt: decode the tokens currently queued in batch, at most n_batch per call
+	    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
+	    {
+		const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); // size of this chunk; the last one may be shorter
+		llama_batch batch_view = {
+		    n_tokens,
+		    batch.token    + i,
+		    nullptr,
+		    batch.pos      + i,
+		    batch.n_seq_id + i,
+		    batch.seq_id   + i,
+		    batch.logits   + i,
+		    0, 0, 0, // unused
+		};
+		if (llama_decode(ctx, batch_view)) // a non-zero return is treated as failure
+		{
+		    LOG_TEE("%s : failed to eval\n", __func__);
+		    return false;
+		}
+	    }
 
-            // process image with llm
-            for (int i = 0; i < img.image_tokens; i += n_batch)
-            {
-                int n_eval = img.image_tokens - i;
-                if (n_eval > n_batch)
-                {
-                    n_eval = n_batch;
-                }
+	    // process image with llm: feed the image embedding in chunks of at most n_batch entries
+	    for (int i = 0; i < img.image_tokens; i += n_batch)
+	    {
+		int n_eval = img.image_tokens - i; // entries still to evaluate, capped at n_batch just below
+		if (n_eval > n_batch)
+		{
+		    n_eval = n_batch;
+		}
 
-                const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
-                if (llama_decode(ctx, batch_img))
-                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
-                    return false;
-                }
-                slot.n_past += n_eval;
-            }
-            image_idx++;
+		const int n_embd = llama_n_embd(model); // floats per embedding entry, used to step through image_embedding
+		llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, }; // embedding-only batch (token pointer is null); NOTE(review): the last three fields are presumably start position, position step and sequence id — confirm against llama_batch
+		if (llama_decode(ctx, batch_img)) // a non-zero return is treated as failure
+		{
+		    LOG_TEE("%s : failed to eval image\n", __func__);
+		    return false;
+		}
+		slot.n_past += n_eval; // advance the slot position past the evaluated embedding entries
+	    }
+	    image_idx++;
 
-            llama_batch_clear(batch);
+	    llama_batch_clear(batch); // the queued prefix was decoded above; empty the batch before queuing the next text segment
 
-            // append prefix of next image
-            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
-                slot.params.input_suffix : // no more images, then process suffix prompt
-                (json)(slot.images[image_idx].prefix_prompt);
+	    // append prefix of next image, or the input suffix after the last image
+	    const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
+		slot.params.input_suffix : // no more images, then process suffix prompt
+		(json)(slot.images[image_idx].prefix_prompt);
 
-            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
-            for (int i = 0; i < (int) append_tokens.size(); ++i)
-            {
-                llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true);
-                slot.n_past += 1;
-            }
-        }
+	    std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // text segment to queue next; second argument is presumably the add-BOS flag, off here — confirm against tokenize()
+	    for (int i = 0; i < (int) append_tokens.size(); ++i)
+	    {
+		llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true); // NOTE(review): position omits the system_tokens.size() offset that update_slots adds — confirm intended
+		slot.n_past += 1;
+	    }
+	}
 
-        return true;
+	return true; // every chunk decoded; tokens queued after the last image are still in batch for the caller
     }
 
     void request_cancel(int task_id)
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        task_server task;
-        task.id = id_gen++;
-        task.type = CANCEL_TASK;
-        task.target_id = task_id;
-        queue_tasks.push_back(task);
+	std::lock_guard<std::mutex> lock(mutex_tasks); // Enqueue a cancel request for task_id; the lock is held until return and covers queue_tasks and id_gen
+	task_server task;
+	task.id = id_gen++; // the cancel request gets its own id from the shared counter
+	task.type = CANCEL_TASK;
+	task.target_id = task_id; // id of the task whose slot process_tasks() should release
+	queue_tasks.push_back(task); // nothing is released here; process_tasks() handles the request
     }
 
     void process_tasks()
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
-        while (!queue_tasks.empty())
-        {
-            task_server task = queue_tasks.front();
-            queue_tasks.erase(queue_tasks.begin());
-            switch (task.type)
-            {
-                case COMPLETION_TASK: {
-                    llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
-                    if (slot == nullptr)
-                    {
-                        LOG_TEE("slot unavailable\n");
-                        // send error result
-                        send_error(task.id, "slot unavaliable");
-                        return;
-                    }
+	std::lock_guard<std::mutex> lock(mutex_tasks); // Drain queue_tasks in FIFO order; the lock is held for the whole drain
+	while (!queue_tasks.empty())
+	{
+	    task_server task = queue_tasks.front(); // copy the oldest task out ...
+	    queue_tasks.erase(queue_tasks.begin()); // ... and remove it from the queue before handling it
+	    switch (task.type)
+	    {
+		case COMPLETION_TASK: {
+		    llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); // "slot_id" from the request, -1 when absent
+		    if (slot == nullptr)
+		    {
+			LOG_TEE("slot unavailable\n");
+			// send error result
+			send_error(task.id, "slot unavaliable");
+			return; // NOTE(review): leaves any remaining queued tasks for the next call — confirm this is intended rather than break
+		    }
 
-                    if (task.data.contains("system_prompt"))
-                    {
-                        process_system_prompt_data(task.data["system_prompt"]);
-                    }
+		    if (task.data.contains("system_prompt")) // the request carries a system prompt: hand it to process_system_prompt_data
+		    {
+			process_system_prompt_data(task.data["system_prompt"]);
+		    }
 
-                    slot->reset();
+		    slot->reset(); // reset the slot before binding it to this task
 
-                    slot->infill = task.infill_mode;
-                    slot->task_id = task.id;
+		    slot->infill = task.infill_mode;
+		    slot->task_id = task.id; // results queued for this slot carry this id
 
-                    if (!launch_slot_with_data(slot, task.data))
-                    {
-                        // send error result
-                        send_error(task.id, "internal_error");
-                        break;
-                    }
-                } break;
-                case CANCEL_TASK: { // release slot linked with the task id
-                    for (auto & slot : slots)
-                    {
-                        if (slot.task_id == task.target_id)
-                        {
-                            slot.release();
-                            break;
-                        }
-                    }
-                } break;
-            }
-        }
+		    if (!launch_slot_with_data(slot, task.data))
+		    {
+			// send error result
+			send_error(task.id, "internal_error");
+			break; // leaves the switch only; the while loop moves on to the next task
+		    }
+		} break;
+		case CANCEL_TASK: { // release slot linked with the task id
+		    for (auto & slot : slots)
+		    {
+			if (slot.task_id == task.target_id)
+			{
+			    slot.release();
+			    break; // stop at the first matching slot
+			}
+		    }
+		} break;
+	    }
+	}
     }
 
     bool update_slots() {
-        // attend tasks
-        process_tasks();
+	// attend tasks
+	process_tasks();
 
-        // update the system prompt wait until all slots are idle state
-        if (system_need_update)
-        {
-            LOG_TEE("updating system prompt\n");
-            update_system_prompt();
-        }
+	// update the system prompt wait until all slots are idle state
+	if (system_need_update)
+	{
+	    LOG_TEE("updating system prompt\n");
+	    update_system_prompt();
+	}
 
-        llama_batch_clear(batch);
+	llama_batch_clear(batch);
 
-        if (all_slots_are_idle)
-        {
-            if (system_prompt.empty() && clean_kv_cache)
-            {
-                LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
-                kv_cache_clear();
-            }
-            // avoid 100% usage of cpu all time
-            std::this_thread::sleep_for(std::chrono::milliseconds(5));
-        }
+	if (all_slots_are_idle)
+	{
+	    if (system_prompt.empty() && clean_kv_cache)
+	    {
+		LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+		kv_cache_clear();
+	    }
+	    // avoid 100% usage of cpu all time
+	    std::this_thread::sleep_for(std::chrono::milliseconds(5));
+	}
 
-        for (llama_client_slot &slot : slots)
-        {
-            if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
-            {
-                // Shift context
-                const int n_left    = slot.n_past - slot.params.n_keep - 1;
-                const int n_discard = n_left / 2;
+	for (llama_client_slot &slot : slots)
+	{
+	    if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
+	    {
+		// Shift context
+		const int n_left    = slot.n_past - slot.params.n_keep - 1;
+		const int n_discard = n_left / 2;
 
-                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
+		LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
+		llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
+		llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
 
-                for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
-                {
-                    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                }
+		for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
+		{
+		    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+		}
 
-                slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+		slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
 
-                slot.n_past -= n_discard;
+		slot.n_past -= n_discard;
 
-                slot.truncated = true;
+		slot.truncated = true;
 
-                LOG_VERBOSE("context shift", {
-                                                {"n_ctx",  n_ctx},
-                                                {"n_keep", params.n_keep},
-                                                {"n_left", n_left},
-                                            });
-            }
-        }
+		LOG_VERBOSE("context shift", {
+						{"n_ctx",  n_ctx},
+						{"n_keep", params.n_keep},
+						{"n_left", n_left},
+					    });
+	    }
+	}
 
-        // decode any currently ongoing sequences
-        for (auto & slot : slots)
-        {
-            // release the slot
-            if (slot.state == PROCESSING && slot.command == RELEASE)
-            {
-                slot.state = IDLE;
-                slot.command = NONE;
-                slot.t_last_used = ggml_time_us();
+	// decode any currently ongoing sequences
+	for (auto & slot : slots)
+	{
+	    // release the slot
+	    if (slot.state == PROCESSING && slot.command == RELEASE)
+	    {
+		slot.state = IDLE;
+		slot.command = NONE;
+		slot.t_last_used = ggml_time_us();
 
-                LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+		LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
 
-                continue;
-            }
+		continue;
+	    }
 
-            if (slot.state == IDLE || slot.command == RELEASE)
-            {
-                continue;
-            }
+	    if (slot.state == IDLE || slot.command == RELEASE)
+	    {
+		continue;
+	    }
 
-            slot.i_batch = batch.n_tokens;
+	    slot.i_batch = batch.n_tokens;
 
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);
+	    llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);
 
-            slot.n_decoded += 1;
-            slot.n_past += 1;
-        }
+	    slot.n_decoded += 1;
+	    slot.n_past += 1;
+	}
 
-        // process in chunks of params.n_batch
-        int32_t n_batch = params.n_batch;
+	// process in chunks of params.n_batch
+	int32_t n_batch = params.n_batch;
 
-        // assign workload to the slots
-        if (params.cont_batching || batch.n_tokens == 0)
-        {
-            for (auto & slot : slots)
-            {
-                // need process the prompt
-                if (slot.state == IDLE && slot.command == LOAD_PROMPT)
-                {
-                    slot.state = PROCESSING;
-                    slot.command = NONE;
-                    std::vector<llama_token> prompt_tokens;
-                    slot.t_start_process_prompt = ggml_time_us();
-                    slot.t_start_genereration = 0;
+	// assign workload to the slots
+	if (params.cont_batching || batch.n_tokens == 0)
+	{
+	    for (auto & slot : slots)
+	    {
+		// need process the prompt
+		if (slot.state == IDLE && slot.command == LOAD_PROMPT)
+		{
+		    slot.state = PROCESSING;
+		    slot.command = NONE;
+		    std::vector<llama_token> prompt_tokens;
+		    slot.t_start_process_prompt = ggml_time_us();
+		    slot.t_start_genereration = 0;
 
-                    if (slot.infill)
-                    {
-                        bool suff_rm_leading_spc = true;
-                        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
-                        {
-                            params.input_suffix.erase(0, 1);
-                            suff_rm_leading_spc = false;
-                        }
-                        auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                        auto suffix_tokens = tokenize(slot.params.input_suffix, false);
+		    if (slot.infill)
+		    {
+			bool suff_rm_leading_spc = true;
+			if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
+			{
+			    params.input_suffix.erase(0, 1);
+			    suff_rm_leading_spc = false;
+			}
+			auto prefix_tokens = tokenize(slot.params.input_prefix, false);
+			auto suffix_tokens = tokenize(slot.params.input_suffix, false);
 
-                        const int space_token = 29871; // TODO: this should not be hardcoded
-                        if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                            suffix_tokens.erase(suffix_tokens.begin());
-                        }
+			const int space_token = 29871; // TODO: this should not be hardcoded
+			if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
+			    suffix_tokens.erase(suffix_tokens.begin());
+			}
 
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
-                        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
-                        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
-                        prefix_tokens.push_back(llama_token_middle(ctx));
-                        prompt_tokens = prefix_tokens;
-                    }
-                    else
-                    {
-                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty());  // add BOS if there isn't system prompt
-                    }
+			prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
+			prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
+			prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
+			prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+			prefix_tokens.push_back(llama_token_middle(ctx));
+			prompt_tokens = prefix_tokens;
+		    }
+		    else
+		    {
+			prompt_tokens = tokenize(slot.prompt, system_prompt.empty(),
+						 false,
+						 slot.params.system,
+						 slot.params.input_prefix,
+						 slot.params.input_suffix);  // add BOS if there isn't system prompt
+		    }
 
-                    slot.num_prompt_tokens = prompt_tokens.size();
+		    slot.num_prompt_tokens = prompt_tokens.size();
 
-                    if (!slot.params.cache_prompt)
-                    {
-                        llama_sampling_reset(slot.ctx_sampling);
+		    if (!slot.params.cache_prompt)
+		    {
+			llama_sampling_reset(slot.ctx_sampling);
 
-                        slot.n_past = 0;
-                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
-                    }
-                    else
-                    {
-                        if (slot.params.n_keep < 0)
-                        {
-                            slot.params.n_keep = slot.num_prompt_tokens;
-                        }
-                        slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
+			slot.n_past = 0;
+			slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+		    }
+		    else
+		    {
+			if (slot.params.n_keep < 0)
+			{
+			    slot.params.n_keep = slot.num_prompt_tokens;
+			}
+			slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
-                        // if input prompt is too big, truncate it
-                        if (slot.num_prompt_tokens >= slot.n_ctx)
-                        {
-                            const int n_left = slot.n_ctx - slot.params.n_keep;
-                            const int n_block_size = n_left / 2;
-                            const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+			// if input prompt is too big, truncate it
+			if (slot.num_prompt_tokens >= slot.n_ctx)
+			{
+			    const int n_left = slot.n_ctx - slot.params.n_keep;
+			    const int n_block_size = n_left / 2;
+			    const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
-                            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
-                            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
+			    std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
+			    new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
 
-                            LOG_VERBOSE("input truncated", {
-                                                            {"n_ctx",  slot.n_ctx},
-                                                            {"n_keep", slot.params.n_keep},
-                                                            {"n_left", n_left},
-                                                            {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
-                                                        });
-                            slot.truncated = true;
-                            prompt_tokens = new_tokens;
+			    LOG_VERBOSE("input truncated", {
+							    {"n_ctx",  slot.n_ctx},
+							    {"n_keep", slot.params.n_keep},
+							    {"n_left", n_left},
+							    {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
+							});
+			    slot.truncated = true;
+			    prompt_tokens = new_tokens;
 
-                            slot.num_prompt_tokens = prompt_tokens.size();
-                            GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
-                        }
+			    slot.num_prompt_tokens = prompt_tokens.size();
+			    GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
+			}
 
-                        // push the prompt into the sampling context (do not apply grammar)
-                        for (auto &token : prompt_tokens)
-                        {
-                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
-                        }
+			// push the prompt into the sampling context (do not apply grammar)
+			for (auto &token : prompt_tokens)
+			{
+			    llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+			}
 
-                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
+			slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+			slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
 
-                        LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
-                    }
+			LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+		    }
 
-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+		    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
 
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+		    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
 
-                    slot.cache_tokens = prompt_tokens;
+		    slot.cache_tokens = prompt_tokens;
 
-                    if (slot.n_past == slot.num_prompt_tokens)
-                    {
-                        // we have to evaluate at least 1 token to generate logits.
-                        LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
-                        slot.n_past--;
-                    }
+		    if (slot.n_past == slot.num_prompt_tokens)
+		    {
+			// we have to evaluate at least 1 token to generate logits.
+			LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
+			slot.n_past--;
+		    }
 
-                    LOG_VERBOSE("prompt ingested", {
-                                                    {"n_past", slot.n_past},
-                                                    {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
-                                                    {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
-                                                });
+		    LOG_VERBOSE("prompt ingested", {
+						    {"n_past", slot.n_past},
+						    {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
+						    {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
+						});
 
-                    const bool has_images = process_images(slot);
+		    const bool has_images = process_images(slot);
 
-                    // process the prefix of first image
-                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
-                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
-                    {
-                       llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
-                    }
+		    // process the prefix of first image
+		    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
+		    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
+		    {
+		       llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
+		    }
 
-                    if (has_images && !ingest_images(slot, n_batch))
-                    {
-                        LOG_TEE("failed processing images\n");
-                        return false;
-                    }
+		    if (has_images && !ingest_images(slot, n_batch))
+		    {
+			LOG_TEE("failed processing images\n");
+			return false;
+		    }
 
-                    // extract the logits only for the last token
-                    if (batch.n_tokens > 0)
-                    {
-                        batch.logits[batch.n_tokens - 1] = true;
-                    }
+		    // extract the logits only for the last token
+		    if (batch.n_tokens > 0)
+		    {
+			batch.logits[batch.n_tokens - 1] = true;
+		    }
 
-                    slot.n_decoded = 0;
-                    slot.i_batch   = batch.n_tokens - 1;
-                }
-            }
-        }
+		    slot.n_decoded = 0;
+		    slot.i_batch   = batch.n_tokens - 1;
+		}
+	    }
+	}
 
-        if (batch.n_tokens == 0)
-        {
-            all_slots_are_idle = true;
-            return true;
-        }
+	if (batch.n_tokens == 0)
+	{
+	    all_slots_are_idle = true;
+	    return true;
+	}
 
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-        {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-            llama_batch batch_view =
-            {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-                0, 0, 0, // unused
-            };
+	for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
+	{
+	    const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+	    llama_batch batch_view =
+	    {
+		n_tokens,
+		batch.token    + i,
+		nullptr,
+		batch.pos      + i,
+		batch.n_seq_id + i,
+		batch.seq_id   + i,
+		batch.logits   + i,
+		0, 0, 0, // unused
+	    };
 
-            const int ret = llama_decode(ctx, batch_view);
-            if (ret != 0)
-            {
-                if (n_batch == 1 || ret < 0)
-                {
-                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
-                    return false;
-                }
+	    const int ret = llama_decode(ctx, batch_view);
+	    if (ret != 0)
+	    {
+		if (n_batch == 1 || ret < 0)
+		{
+		    // if you get here, it means the KV cache is full - try increasing it via the context size
+		    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+		    return false;
+		}
 
-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+		LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
 
-                // retry with half the batch size to try to find a free slot in the KV cache
-                n_batch /= 2;
-                i -= n_batch;
-                continue;
-            }
+		// retry with half the batch size to try to find a free slot in the KV cache
+		n_batch /= 2;
+		i -= n_batch;
+		continue;
+	    }
 
-            for (auto & slot : slots)
-            {
-                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
-                {
-                    continue;
-                }
+	    for (auto & slot : slots)
+	    {
+		if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
+		{
+		    continue;
+		}
 
-                // prompt evaluated for embedding
-                if (params.embedding)
-                {
-                    send_embedding(slot);
-                    slot.release();
-                    slot.i_batch = -1;
-                    return true;
-                }
+		// prompt evaluated for embedding
+		if (params.embedding)
+		{
+		    send_embedding(slot);
+		    slot.release();
+		    slot.i_batch = -1;
+		    return true;
+		}
 
-                completion_token_output result;
-                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+		completion_token_output result;
+		const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
 
-                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+		llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
 
-                if (slot.n_decoded == 1)
-                {
-                    slot.t_start_genereration = ggml_time_us();
-                    slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
-                }
+		if (slot.n_decoded == 1)
+		{
+		    slot.t_start_genereration = ggml_time_us();
+		    slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
+		}
 
-                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
-                result.tok = id;
+		llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
+		result.tok = id;
 
-                const int32_t n_probs = slot.sparams.n_probs;
-                if (slot.sparams.temp <= 0 && n_probs > 0)
-                {
-                    // for llama_sample_token_greedy we need to sort candidates
-                    llama_sample_softmax(ctx, &cur_p);
-                }
+		const int32_t n_probs = slot.sparams.n_probs;
+		if (slot.sparams.temp <= 0 && n_probs > 0)
+		{
+		    // for llama_sample_token_greedy we need to sort candidates
+		    llama_sample_softmax(ctx, &cur_p);
+		}
 
-                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
-                {
-                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
-                }
+		for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
+		{
+		    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+		}
 
-                if (!process_token(result, slot))
-                {
-                    slot.release();
-                    send_final_response(slot);
-                    slot.print_timings();
-                }
+		if (!process_token(result, slot))
+		{
+		    slot.release();
+		    send_final_response(slot);
+		    slot.print_timings();
+		}
 
-                slot.i_batch = -1;
-            }
-        }
-        return true;
+		slot.i_batch = -1;
+	    }
+	}
+	return true;
     }
 };
 
 static void server_print_usage(const char *argv0, const gpt_params &params,
-                               const server_params &sparams)
+			       const server_params &sparams)
 {
     printf("usage: %s [options]\n", argv0);
     printf("\n");
@@ -1777,11 +1866,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
-        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+	printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported())
     {
-        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+	printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
     printf("  --numa                attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -1814,7 +1903,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 }
 
 static void server_params_parse(int argc, char **argv, server_params &sparams,
-                                gpt_params &params, llama_server_context& llama)
+				gpt_params &params, llama_server_context& llama)
 {
     gpt_params default_params;
     server_params default_sparams;
@@ -1823,306 +1912,306 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
     for (int i = 1; i < argc; i++)
     {
-        arg = argv[i];
-        if (arg == "--port")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.port = std::stoi(argv[i]);
-        }
-        else if (arg == "--host")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.hostname = argv[i];
-        }
-        else if (arg == "--path")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.public_path = argv[i];
-        }
-        else if (arg == "--timeout" || arg == "-to")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            sparams.read_timeout = std::stoi(argv[i]);
-            sparams.write_timeout = std::stoi(argv[i]);
-        }
-        else if (arg == "-m" || arg == "--model")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        }
-        else if (arg == "-a" || arg == "--alias")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.model_alias = argv[i];
-        }
-        else if (arg == "-h" || arg == "--help")
-        {
-            server_print_usage(argv[0], default_params, default_sparams);
-            exit(0);
-        }
-        else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        }
-        else if (arg == "--rope-freq-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_base = std::stof(argv[i]);
-        }
-        else if (arg == "--rope-freq-scale")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rope_freq_scale = std::stof(argv[i]);
-        }
-        else if (arg == "--memory-f32" || arg == "--memory_f32")
-        {
-            params.memory_f16 = false;
-        }
-        else if (arg == "--threads" || arg == "-t")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        }
-        else if (arg == "-b" || arg == "--batch-size")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
-        }
-        else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
+	arg = argv[i];
+	if (arg == "--port")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    sparams.port = std::stoi(argv[i]);
+	}
+	else if (arg == "--host")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    sparams.hostname = argv[i];
+	}
+	else if (arg == "--path")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    sparams.public_path = argv[i];
+	}
+	else if (arg == "--timeout" || arg == "-to")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    sparams.read_timeout = std::stoi(argv[i]);
+	    sparams.write_timeout = std::stoi(argv[i]);
+	}
+	else if (arg == "-m" || arg == "--model")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.model = argv[i];
+	}
+	else if (arg == "-a" || arg == "--alias")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.model_alias = argv[i];
+	}
+	else if (arg == "-h" || arg == "--help")
+	{
+	    server_print_usage(argv[0], default_params, default_sparams);
+	    exit(0);
+	}
+	else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.n_ctx = std::stoi(argv[i]);
+	}
+	else if (arg == "--rope-freq-base")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.rope_freq_base = std::stof(argv[i]);
+	}
+	else if (arg == "--rope-freq-scale")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.rope_freq_scale = std::stof(argv[i]);
+	}
+	else if (arg == "--memory-f32" || arg == "--memory_f32")
+	{
+	    params.memory_f16 = false;
+	}
+	else if (arg == "--threads" || arg == "-t")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.n_threads = std::stoi(argv[i]);
+	}
+	else if (arg == "-b" || arg == "--batch-size")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.n_batch = std::stoi(argv[i]);
+	    params.n_batch = std::min(512, params.n_batch);
+	}
+	else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params.n_gpu_layers = std::stoi(argv[i]);
+	    params.n_gpu_layers = std::stoi(argv[i]);
 #else
-            LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-                        "See main README.md for information on enabling GPU BLAS support",
-                        {{"n_gpu_layers", params.n_gpu_layers}});
+	    LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+			"See main README.md for information on enabling GPU BLAS support",
+			{{"n_gpu_layers", params.n_gpu_layers}});
 #endif
-        }
-        else if (arg == "--tensor-split" || arg == "-ts")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
+	}
+	else if (arg == "--tensor-split" || arg == "-ts")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
 #ifdef GGML_USE_CUBLAS
-            std::string arg_next = argv[i];
+	    std::string arg_next = argv[i];
 
-            // split string by , and /
-            const std::regex regex{R"([,/]+)"};
-            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
-            std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+	    // split string by , and /
+	    const std::regex regex{R"([,/]+)"};
+	    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+	    std::vector<std::string> split_arg{it, {}};
+	    GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
 
-            for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
-            {
-                if (i_device < split_arg.size())
-                {
-                    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
-                }
-                else
-                {
-                    params.tensor_split[i_device] = 0.0f;
-                }
-            }
+	    for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
+	    {
+		if (i_device < split_arg.size())
+		{
+		    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
+		}
+		else
+		{
+		    params.tensor_split[i_device] = 0.0f;
+		}
+	    }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
+	    LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
-        }
-        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
-        {
+	}
+	else if (arg == "--no-mul-mat-q" || arg == "-nommq")
+	{
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = false;
+	    params.mul_mat_q = false;
 #else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
+	    LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
-        }
-        else if (arg == "--main-gpu" || arg == "-mg")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
+	}
+	else if (arg == "--main-gpu" || arg == "-mg")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
 #ifdef GGML_USE_CUBLAS
-            params.main_gpu = std::stoi(argv[i]);
+	    params.main_gpu = std::stoi(argv[i]);
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+	    LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
 #endif
-        }
-        else if (arg == "--lora")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
-            params.use_mmap = false;
-        }
-        else if (arg == "--lora-scaled")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            const char * lora_adapter = argv[i];
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
-            params.use_mmap = false;
-        }
-        else if (arg == "--lora-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        }
-        else if (arg == "-v" || arg == "--verbose")
-        {
+	}
+	else if (arg == "--lora")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
+	    params.use_mmap = false;
+	}
+	else if (arg == "--lora-scaled")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    const char * lora_adapter = argv[i];
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
+	    params.use_mmap = false;
+	}
+	else if (arg == "--lora-base")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.lora_base = argv[i];
+	}
+	else if (arg == "-v" || arg == "--verbose")
+	{
 #if SERVER_VERBOSE != 1
-            LOG_WARNING("server.cpp is not built with verbose logging.", {});
+	    LOG_WARNING("server.cpp is not built with verbose logging.", {});
 #else
-            server_verbose = true;
+	    server_verbose = true;
 #endif
-        }
-        else if (arg == "--mlock")
-        {
-            params.use_mlock = true;
-        }
-        else if (arg == "--no-mmap")
-        {
-            params.use_mmap = false;
-        }
-        else if (arg == "--numa")
-        {
-            params.numa = true;
-        }
-        else if (arg == "--embedding")
-        {
-            params.embedding = true;
-        }
-        else if (arg == "-cb" || arg == "--cont-batching")
-        {
-            params.cont_batching = true;
-        }
-        else if (arg == "-np" || arg == "--parallel")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_parallel = std::stoi(argv[i]);
-        } else if (arg == "-n" || arg == "--n-predict")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "-spf" || arg == "--system-prompt-file")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::string systm_content;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(systm_content)
-            );
-            llama.process_system_prompt_data(json::parse(systm_content));
-        }
-        else if(arg == "--mmproj")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.mmproj = argv[i];
-        }
-        else
-        {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            server_print_usage(argv[0], default_params, default_sparams);
-            exit(1);
-        }
+	}
+	else if (arg == "--mlock")
+	{
+	    params.use_mlock = true;
+	}
+	else if (arg == "--no-mmap")
+	{
+	    params.use_mmap = false;
+	}
+	else if (arg == "--numa")
+	{
+	    params.numa = true;
+	}
+	else if (arg == "--embedding")
+	{
+	    params.embedding = true;
+	}
+	else if (arg == "-cb" || arg == "--cont-batching")
+	{
+	    params.cont_batching = true;
+	}
+	else if (arg == "-np" || arg == "--parallel")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.n_parallel = std::stoi(argv[i]);
+	} else if (arg == "-n" || arg == "--n-predict")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.n_predict = std::stoi(argv[i]);
+	} else if (arg == "-spf" || arg == "--system-prompt-file")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    std::ifstream file(argv[i]);
+	    if (!file) {
+		fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+		invalid_param = true;
+		break;
+	    }
+	    std::string systm_content;
+	    std::copy(
+		std::istreambuf_iterator<char>(file),
+		std::istreambuf_iterator<char>(),
+		std::back_inserter(systm_content)
+	    );
+	    llama.process_system_prompt_data(json::parse(systm_content));
+	}
+	else if(arg == "--mmproj")
+	{
+	    if (++i >= argc)
+	    {
+		invalid_param = true;
+		break;
+	    }
+	    params.mmproj = argv[i];
+	}
+	else
+	{
+	    fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+	    server_print_usage(argv[0], default_params, default_sparams);
+	    exit(1);
+	}
     }
 
     if (invalid_param)
     {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        server_print_usage(argv[0], default_params, default_sparams);
-        exit(1);
+	fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+	server_print_usage(argv[0], default_params, default_sparams);
+	exit(1);
     }
 }
 
@@ -2131,15 +2220,15 @@ static json format_partial_response(
 ) {
     json res = json
     {
-        {"content",    content },
-        {"stop",       false},
-        {"slot_id",    slot->id },
-        {"multimodal", llama.multimodal }
+	{"content",    content },
+	{"stop",       false},
+	{"slot_id",    slot->id },
+	{"multimodal", llama.multimodal }
     };
 
     if (slot->sparams.n_probs > 0)
     {
-        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
+	res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
     }
 
     return res;
@@ -2148,31 +2237,31 @@ static json format_partial_response(
 static json format_tokenizer_response(const std::vector<llama_token> &tokens)
 {
     return json{
-        {"tokens", tokens}};
+	{"tokens", tokens}};
 }
 
 static json format_detokenized_response(std::string content)
 {
     return json{
-        {"content", content}};
+	{"content", content}};
 }
 
 
 static void log_server_request(const httplib::Request &req, const httplib::Response &res)
 {
     LOG_INFO("request", {
-                            {"remote_addr", req.remote_addr},
-                            {"remote_port", req.remote_port},
-                            {"status", res.status},
-                            {"method", req.method},
-                            {"path", req.path},
-                            {"params", req.params},
-                        });
+			    {"remote_addr", req.remote_addr},
+			    {"remote_port", req.remote_port},
+			    {"status", res.status},
+			    {"method", req.method},
+			    {"path", req.path},
+			    {"params", req.params},
+			});
 
     LOG_VERBOSE("request", {
-                               {"request", req.body},
-                               {"response", res.body},
-                           });
+			       {"request", req.body},
+			       {"response", res.body},
+			   });
 }
 
 struct token_translator
@@ -2190,11 +2279,11 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
     if (slot->generated_text.capacity() < slot->generated_text.size() + len)
     {
-        slot->generated_text.reserve(slot->generated_text.size() + len);
+	slot->generated_text.reserve(slot->generated_text.size() + len);
     }
     for (const completion_token_output & cto : gtps)
     {
-        slot->generated_text += translator(cto);
+	slot->generated_text += translator(cto);
     }
 }
 
@@ -2211,25 +2300,25 @@ int main(int argc, char **argv)
 
     if (params.model_alias == "unknown")
     {
-        params.model_alias = params.model;
+	params.model_alias = params.model;
     }
 
     llama_backend_init(params.numa);
 
     LOG_INFO("build info", {{"build", BUILD_NUMBER},
-                            {"commit", BUILD_COMMIT}});
+			    {"commit", BUILD_COMMIT}});
 
     LOG_INFO("system info", {
-                                {"n_threads", params.n_threads},
-                                {"n_threads_batch", params.n_threads_batch},
-                                {"total_threads", std::thread::hardware_concurrency()},
-                                {"system_info", llama_print_system_info()},
-                            });
+				{"n_threads", params.n_threads},
+				{"n_threads_batch", params.n_threads_batch},
+				{"total_threads", std::thread::hardware_concurrency()},
+				{"system_info", llama_print_system_info()},
+			    });
 
     // load the model
     if (!llama.load_model(params))
     {
-        return 1;
+	return 1;
     }
 
     llama.initialize();
@@ -2237,248 +2326,260 @@ int main(int argc, char **argv)
     httplib::Server svr;
 
     svr.set_default_headers({{"Server", "llama.cpp"},
-                             {"Access-Control-Allow-Origin", "*"},
-                             {"Access-Control-Allow-Headers", "content-type"}});
+			     {"Access-Control-Allow-Origin", "*"},
+			     {"Access-Control-Allow-Headers", "content-type"}});
 
     // this is only called if no index.html is found in the public --path
     svr.Get("/", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
-                return false;
-            });
+	    {
+		res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
+		return false;
+	    });
 
     // this is only called if no index.js is found in the public --path
     svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
-                return false;
-            });
+	    {
+		res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
+		return false;
+	    });
 
     // this is only called if no index.html is found in the public --path
     svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
-                return false;
-            });
+	    {
+		res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
+		return false;
+	    });
 
     // this is only called if no index.html is found in the public --path
     svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
-            {
-                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
-                return false;
-            });
+	    {
+		res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
+		return false;
+	    });
 
     svr.Get("/props", [&llama](const httplib::Request & /*req*/, httplib::Response &res)
-            {
-                res.set_header("Access-Control-Allow-Origin", "*");
-                json data = {
-                    { "user_name",      llama.name_user.c_str() },
-                    { "assistant_name", llama.name_assistant.c_str() }
-                };
-                res.set_content(data.dump(), "application/json");
-            });
+	    {
+		res.set_header("Access-Control-Allow-Origin", "*");
+		json data = {
+		    { "user_name",      llama.name_user.c_str() },
+		    { "assistant_name", llama.name_assistant.c_str() }
+		};
+		res.set_content(data.dump(), "application/json");
+	    });
 
     svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false);
-                if (!json_value(data, "stream", false)) {
-                    std::string completion_text;
-                    task_result result = llama.next_result(task_id);
-                    if(!result.error && result.stop) {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
-                    }
-                    else
-                    {
-                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain");
-                        return;
-                    }
-                } else {
-                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
-                    {
-                        while (true)
-                        {
-                            task_result result = llama.next_result(task_id);
-                            if (!result.error) {
-                                const std::string str =
-                                "data: " +
-                                result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    return false;
-                                }
-                                if(result.stop) {
-                                    break;
-                                }
-                            } else {
-                                break;
-                            }
-                        }
-                        sink.done();
-                        return true;
-                    };
+	    {   // Pass the parsed body to llama.request_completion(), then reply in one piece or as an event stream.
+		json data = json::parse(req.body);
+		const int task_id = llama.request_completion(data, false);   // 2nd argument is true only in /infill
+		if (!json_value(data, "stream", false)) {
+		    // Not streaming: reply once with the next result reported for this task.
+		    task_result result = llama.next_result(task_id);
+		    if(!result.error && result.stop) {
+			res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+		    }
+		    else
+		    {
+			res.status = 404;   // error, or a result without the stop flag: send its "content" as plain text
+			res.set_content(result.result_json["content"], "text/plain");
+			return;
+		    }
+		} else {   // streaming: every result becomes one "data: <json>\n\n" chunk
+		    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
+		    {
+			while (true)
+			{
+			    task_result result = llama.next_result(task_id);
+			    if (!result.error) {
+				const std::string str =
+				"data: " +
+				result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+				"\n\n";
+				LOG_VERBOSE("data stream", {
+				    { "to_send", str }
+				});
+				if (!sink.write(str.c_str(), str.size()))
+				{
+				    return false;   // write was rejected: leave without calling sink.done()
+				}
+				if(result.stop) {
+				    break;   // final result has been sent
+				}
+			    } else {
+				break;   // an error result ends the stream without being sent
+			    }
+			}
+			sink.done();
+			return true;
+		    };
 
-                    auto on_complete = [task_id, &llama] (bool)
-                    {
-                        // cancel
-                        llama.request_cancel(task_id);
-                    };
+		    auto on_complete = [task_id, &llama] (bool)
+		    {
+			// cancel
+			llama.request_cancel(task_id);
+		    };
 
-                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-                }
-            });
+		    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+		}
+	    });
 
     svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true);
-                if (!json_value(data, "stream", false)) {
-                    std::string completion_text;
-                    task_result result = llama.next_result(task_id);
-                    if (!result.error && result.stop)
-                    {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
-                    }
-                    else
-                    {
-                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain");
-                        return;
-                    }
-                } else {
-                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
-                        while (true)
-                        {
-                            task_result result = llama.next_result(task_id);
-                            if (!result.error) {
-                                const std::string str =
-                                "data: " +
-                                result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                                "\n\n";
-                                LOG_VERBOSE("data stream", {
-                                    { "to_send", str }
-                                });
-                                if (!sink.write(str.c_str(), str.size()))
-                                {
-                                    return false;
-                                }
-                                if (result.stop)
-                                {
-                                    break;
-                                }
-                            }
-                            else
-                            {
-                                break;
-                            }
-                        }
+	    {   // Same flow as /completion, but request_completion() is called with true as its 2nd argument.
+		json data = json::parse(req.body);
+		const int task_id = llama.request_completion(data, true);
+		if (!json_value(data, "stream", false)) {
+		    // Not streaming: reply once with the next result reported for this task.
+		    task_result result = llama.next_result(task_id);
+		    if (!result.error && result.stop)
+		    {
+			res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+		    }
+		    else
+		    {
+			res.status = 404;   // error, or a result without the stop flag: send its "content" as plain text
+			res.set_content(result.result_json["content"], "text/plain");
+			return;
+		    }
+		} else {   // streaming: every result becomes one "data: <json>\n\n" chunk
+		    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
+			while (true)
+			{
+			    task_result result = llama.next_result(task_id);
+			    if (!result.error) {
+				const std::string str =
+				"data: " +
+				result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+				"\n\n";
+				LOG_VERBOSE("data stream", {
+				    { "to_send", str }
+				});
+				if (!sink.write(str.c_str(), str.size()))
+				{
+				    return false;   // write was rejected: leave without calling sink.done()
+				}
+				if (result.stop)
+				{
+				    break;   // final result has been sent
+				}
+			    }
+			    else
+			    {
+				break;   // an error result ends the stream without being sent
+			    }
+			}
 
-                        sink.done();
+			sink.done();
 
-                        return true;
-                    };
+			return true;
+		    };
 
-                    auto on_complete = [task_id, &llama] (bool)
-                    {
-                        // cancel
-                        llama.request_cancel(task_id);
-                    };
+		    auto on_complete = [task_id, &llama] (bool)
+		    {
+			// cancel
+			llama.request_cancel(task_id);
+		    };
 
-                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
-                }
-            });
+		    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+		}
+	    });
 
     svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
-            {
-                const json data = llama.get_model_props();
-                return res.set_content(data.dump(), "application/json");
-            });
+	    {
+		const json data = llama.get_model_props();
+		return res.set_content(data.dump(), "application/json");
+	    });
 
     svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
-                { return res.set_content("", "application/json"); });
+		{ return res.set_content("", "application/json"); });
 
     svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                const json body = json::parse(req.body);
-                std::vector<llama_token> tokens;
-                if (body.count("content") != 0)
-                {
-                    tokens = llama.tokenize(body["content"], false);
-                }
-                const json data = format_tokenizer_response(tokens);
-                return res.set_content(data.dump(), "application/json");
-            });
+	    {
+		const json body = json::parse(req.body);
+		std::vector<llama_token> tokens;
+		if (body.count("content") != 0)
+		{
+		    tokens = llama.tokenize(body["content"], false);
+		}
+		const json data = format_tokenizer_response(tokens);
+		return res.set_content(data.dump(), "application/json");
+	    });
+
+    svr.Post("/tokenizes", [&llama](const httplib::Request &request, httplib::Response &response)
+	    {
+		// Twin of /tokenize that passes true as tokenize()'s 3rd argument (presumably special-token parsing; confirm).
+		const json payload = json::parse(request.body);
+		std::vector<llama_token> token_ids;
+		if (payload.count("content") > 0) {
+		    token_ids = llama.tokenize(payload["content"], false, true);
+		}
+		const json reply = format_tokenizer_response(token_ids);
+		return response.set_content(reply.dump(), "application/json");
+	    });
 
     svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                const json body = json::parse(req.body);
-                std::string content;
-                if (body.count("tokens") != 0)
-                {
-                    const std::vector<llama_token> tokens = body["tokens"];
-                    content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
-                }
+	    {
+		const json body = json::parse(req.body);
+		std::string content;
+		if (body.count("tokens") != 0)
+		{
+		    const std::vector<llama_token> tokens = body["tokens"];
+		    content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+		}
 
-                const json data = format_detokenized_response(content);
-                return res.set_content(data.dump(), "application/json");
-            });
+		const json data = format_detokenized_response(content);
+		return res.set_content(data.dump(), "application/json");
+	    });
 
     svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
-            {
-                const json body = json::parse(req.body);
-                json prompt;
-                if (body.count("content") != 0)
-                {
-                    prompt = body["content"];
-                }
-                else
-                {
-                    prompt = "";
-                }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
-                task_result result = llama.next_result(task_id);
-                return res.set_content(result.result_json.dump(), "application/json");
-            });
+	    {
+		const json body = json::parse(req.body);
+		json prompt;
+		if (body.count("content") != 0)
+		{
+		    prompt = body["content"];
+		}
+		else
+		{
+		    prompt = "";
+		}
+		const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
+		task_result result = llama.next_result(task_id);
+		return res.set_content(result.result_json.dump(), "application/json");
+	    });
 
     svr.set_logger(log_server_request);
 
     svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
-            {
-                const char fmt[] = "500 Internal Server Error\n%s";
-                char buf[BUFSIZ];
-                try
-                {
-                    std::rethrow_exception(std::move(ep));
-                }
-                catch (std::exception &e)
-                {
-                    snprintf(buf, sizeof(buf), fmt, e.what());
-                }
-                catch (...)
-                {
-                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
-                }
-                res.set_content(buf, "text/plain");
-                res.status = 500;
-            });
+	    {   // Convert any exception handed to this handler into a plain-text 500 reply.
+		const char fmt[] = "500 Internal Server Error\n%s";
+		char buf[BUFSIZ];   // snprintf() below truncates text that does not fit
+		try
+		{
+		    std::rethrow_exception(std::move(ep));   // rethrow to recover the concrete exception type
+		}
+		catch (const std::exception &e)   // const&: only what() is read, nothing is modified
+		{
+		    snprintf(buf, sizeof(buf), fmt, e.what());
+		}
+		catch (...)   // anything not derived from std::exception gets a fixed message
+		{
+		    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+		}
+		res.set_content(buf, "text/plain");
+		res.status = 500;
+	    });
 
     svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
-            {
-                if (res.status == 400)
-                {
-                    res.set_content("Invalid request", "text/plain");
-                }
-                else if (res.status != 500)
-                {
-                    res.set_content("File Not Found", "text/plain");
-                    res.status = 404;
-                }
-            });
+	    {
+		// 400 gets a fixed message, 500 is left untouched, and every other
+		// status is rewritten into a 404 "File Not Found" reply.
+		if (res.status == 400) {
+		    res.set_content("Invalid request", "text/plain");
+		    return;
+		}
+		if (res.status == 500) { return; }
+		res.set_content("File Not Found", "text/plain");
+		res.status = 404;
+	    });
 
     // set timeouts and change hostname and port
     svr.set_read_timeout (sparams.read_timeout);
@@ -2486,8 +2587,8 @@ int main(int argc, char **argv)
 
     if (!svr.bind_to_port(sparams.hostname, sparams.port))
     {
-        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
-        return 1;
+	fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
+	return 1;
     }
 
     // Set the base directory for serving static files
@@ -2497,30 +2598,30 @@ int main(int argc, char **argv)
     LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
 
     LOG_INFO("HTTP server listening", {
-                                          {"hostname", sparams.hostname},
-                                          {"port", sparams.port},
-                                      });
+					  {"hostname", sparams.hostname},
+					  {"port", sparams.port},
+				      });
 
     // run the HTTP server in a thread - see comment below
     std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    return 1;
-                }
+	    {
+		if (!svr.listen_after_bind())
+		{
+		    return 1;
+		}
 
-                return 0;
-            });
+		return 0;
+	    });
 
     // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
     //     "Bus error: 10" - this is on macOS, it does not crash on Linux
     //std::thread t2([&]()
     {
-        bool running = true;
-        while (running)
-        {
-            running = llama.update_slots();
-        }
+	bool running = true;
+	while (running)
+	{
+	    running = llama.update_slots();
+	}
     }
     //);