From c1cbde82a12d59a0ee8ae2ae6025c99f18c1e526 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 00:00:56 -0300 Subject: [PATCH 01/10] print error when server can't bind to the interface --- examples/server/server.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ad46f56e9..5c1662865 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -901,8 +901,6 @@ int main(int argc, char **argv) return res.set_content(data.dump(llama.json_indent), "application/json"); }); - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); - if(params.embedding) { fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n"); } @@ -930,5 +928,16 @@ int main(int argc, char **argv) // set timeouts and change hostname and port svr.set_read_timeout(sparams.read_timeout); svr.set_write_timeout(sparams.write_timeout); - svr.listen(sparams.hostname, sparams.port); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + return 1; + } + + fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + if (!svr.listen_after_bind()) { + return 1; + } } From 2c08f29691d6a69bb1c26db2a239e8a8124c313d Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 09:02:32 -0300 Subject: [PATCH 02/10] make api server use only a single thread --- examples/server/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index b38fa864a..67b086754 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -2,6 +2,9 @@ set(TARGET server) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) target_compile_definitions(${TARGET} PRIVATE + # single thread + CPPHTTPLIB_THREAD_POOL_COUNT=1 + # crash the server in the debug mode, otherwise send http 500 error $<$: CPPHTTPLIB_NO_EXCEPTIONS=1 > From 284bc293b1e003659416e776d1b9528ebca38d10 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:46:06 -0300 Subject: [PATCH 03/10] reserve memory for generated_text --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5c1662865..b42333228 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -42,6 +42,7 @@ struct llama_server_context params.antiprompt.clear(); num_tokens_predicted = 0; generated_text = ""; + generated_text.reserve(params.n_ctx); stopping_word = ""; //processed_tokens.clear(); From f1710b90dcd4fb47a170e8e05faceb26ed594580 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:35:25 -0300 Subject: [PATCH 04/10] add infinite generation when n_predict is -1 --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b42333228..b0f0486b7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -250,7 +250,7 @@ struct llama_server_context return result; } - has_next_token = n_remain != 0; + has_next_token = params.n_predict == -1 ? 
true : n_remain != 0; return result; } From aa2bbb2d357617907278b5102abbae49bab2236a Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:36:51 -0300 Subject: [PATCH 05/10] fix parameter type --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b0f0486b7..37b5b78d3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -52,7 +52,7 @@ struct llama_server_context n_consumed = 0; } - bool loadModel(gpt_params params_) + bool loadModel(const gpt_params ¶ms_) { params = params_; ctx = llama_init_from_gpt_params(params); From 27911d6d68d465dc944af508aeb284288019eb3b Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:37:52 -0300 Subject: [PATCH 06/10] fix default model alias --- examples/server/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 37b5b78d3..fbfcc6b7f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -781,6 +781,10 @@ int main(int argc, char **argv) llama.verbose = sparams.verbose; llama.json_indent = sparams.verbose ? 4 : -1; + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + // load the model if (!llama.loadModel(params)) { From dd3021933232573bfdde2cb249c22ab332d353f3 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:40:42 -0300 Subject: [PATCH 07/10] buffer incomplete multi-byte characters --- examples/server/server.cpp | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fbfcc6b7f..b78992a13 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -842,16 +842,49 @@ int main(int argc, char **argv) "application/json"); } else { const auto chunked_content_provider = [&](size_t, DataSink &sink) { + size_t sent_count = 0; + int32_t multibyte_pending = 0; + while (llama.has_next_token) { std::string token_text = llama.doCompletion(); + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } + + if (multibyte_pending > 0) { + if (!llama.has_next_token) { + llama.has_next_token = true; + llama.n_remain++; + } + continue; + } + + const size_t pos = std::min(sent_count, llama.generated_text.size()); + std::string to_send = llama.generated_text.substr(pos); + sent_count += to_send.size(); + json data; if (llama.has_next_token) { - data = {{"content", token_text}, {"stop", false}}; + data = {{"content", to_send}, {"stop", false}}; } else { // Generation is done, send extra information. data = { - {"content", token_text}, + {"content", to_send}, {"stop", true}, {"model", llama.params.model_alias}, {"tokens_predicted", llama.num_tokens_predicted}, From 40e13805d983c93598249c2673ba9fc4e8f1dc0d Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:41:47 -0300 Subject: [PATCH 08/10] print timings + build info I don't know if llama_free is needed but it was used in main.cpp. 
--- examples/server/server.cpp | 49 ++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b78992a13..acccbc9d7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,7 +1,9 @@ -#include -#include #include "common.h" #include "llama.h" +#include "build-info.h" + +#include +#include struct server_params { @@ -30,7 +32,7 @@ struct llama_server_context std::vector embd_inp; std::vector last_prompt_tokens; - llama_context *ctx; + llama_context *ctx = nullptr; gpt_params params; std::string stopping_word; @@ -38,6 +40,14 @@ struct llama_server_context bool verbose = false; int json_indent = -1; + ~llama_server_context() + { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + } + void rewind() { params.antiprompt.clear(); num_tokens_predicted = 0; @@ -765,6 +775,8 @@ std::string log(const Request &req, const Response &res) int main(int argc, char **argv) { + llama_init_backend(); + // own arguments required by this example gpt_params params; server_params sparams; @@ -785,6 +797,10 @@ int main(int argc, char **argv) params.model_alias = params.model; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, + std::thread::hardware_concurrency(), llama_print_system_info()); + // load the model if (!llama.loadModel(params)) { @@ -809,6 +825,7 @@ int main(int argc, char **argv) } llama.rewind(); + llama_reset_timings(llama.ctx); if (parse_options_completion(json::parse(req.body), llama, res) == false) { return; @@ -837,6 +854,11 @@ int main(int argc, char **argv) {"generation_settings", format_generation_settings(llama)}, {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}}; + + if (llama.verbose) { + llama_print_timings(llama.ctx); + } + return res.set_content( data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), "application/json"); @@ -894,18 +916,29 @@ int main(int argc, char **argv) {"generated_text", llama.generated_text}}; } - std::string str = "data: " + - data.dump(llama.json_indent, ' ', false, - json::error_handler_t::replace) + - "\n\n"; + std::string str = + "data: " + + data.dump(llama.has_next_token ? 
-1 : llama.json_indent, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + + if (llama.verbose) { + fprintf(stderr, "to_send=%s", str.c_str()); + } + if (!sink.write(str.data(), str.size())) { if (llama.verbose) { fprintf(stderr, "stream closed\n"); + llama_print_timings(llama.ctx); } return false; } } + if (llama.verbose) { + llama_print_timings(llama.ctx); + } + sink.done(); return true; }; @@ -978,4 +1011,6 @@ int main(int argc, char **argv) if (!svr.listen_after_bind()) { return 1; } + + return 0; } From d58e48663d119d439abbd388390f7101dec3bbe5 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 11:56:12 -0300 Subject: [PATCH 09/10] default penalize_nl to false + format --- examples/server/server.cpp | 287 +++++++++++++++---------------------- 1 file changed, 114 insertions(+), 173 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index acccbc9d7..eb75ab1de 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -507,210 +507,151 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para return true; } -bool parse_options_completion(json body, llama_server_context& llama, Response &res) { +bool parse_options_completion(json body, llama_server_context& llama, Response &res) +{ gpt_params default_params; - if (!body["stream"].is_null()) - { - llama.stream = body["stream"].get(); + if (!body["stream"].is_null()) { + llama.stream = body["stream"].get(); + } else { + llama.stream = false; } - else - { - llama.stream = false; + if (!body["n_predict"].is_null()) { + llama.params.n_predict = body["n_predict"].get(); + } else { + llama.params.n_predict = default_params.n_predict; } - if (!body["n_predict"].is_null()) - { - llama.params.n_predict = body["n_predict"].get(); + if (!body["top_k"].is_null()) { + llama.params.top_k = body["top_k"].get(); + } else { + llama.params.top_k = default_params.top_k; } - else - { - llama.params.n_predict = default_params.n_predict; + if (!body["top_p"].is_null()) { + llama.params.top_p = body["top_p"].get(); + } else { + llama.params.top_p = default_params.top_p; } - if (!body["top_k"].is_null()) - { - llama.params.top_k = body["top_k"].get(); + if (!body["tfs_z"].is_null()) { + llama.params.tfs_z = body["tfs_z"].get(); + } else { + llama.params.tfs_z = default_params.tfs_z; } - else - { - llama.params.top_k = default_params.top_k; + if (!body["typical_p"].is_null()) { + llama.params.typical_p = body["typical_p"].get(); + } else { + llama.params.typical_p = default_params.typical_p; } - if (!body["top_p"].is_null()) - { - llama.params.top_p = body["top_p"].get(); + if (!body["repeat_last_n"].is_null()) { + llama.params.repeat_last_n = body["repeat_last_n"].get(); + } else { + llama.params.repeat_last_n = default_params.repeat_last_n; } - else - { - llama.params.top_p = default_params.top_p; + if (!body["temperature"].is_null()) { + llama.params.temp = body["temperature"].get(); + } else { + llama.params.temp = default_params.temp; } - if (!body["tfs_z"].is_null()) - { - llama.params.tfs_z = body["tfs_z"].get(); + if (!body["repeat_penalty"].is_null()) { + llama.params.repeat_penalty = body["repeat_penalty"].get(); + } else { + llama.params.repeat_penalty = default_params.repeat_penalty; } - else - { - llama.params.tfs_z = default_params.tfs_z; + if (!body["presence_penalty"].is_null()) { + llama.params.presence_penalty = body["presence_penalty"].get(); + } else { + llama.params.presence_penalty = default_params.presence_penalty; } - if (!body["typical_p"].is_null()) 
- { - llama.params.typical_p = body["typical_p"].get(); + if (!body["frequency_penalty"].is_null()) { + llama.params.frequency_penalty = body["frequency_penalty"].get(); + } else { + llama.params.frequency_penalty = default_params.frequency_penalty; } - else - { - llama.params.typical_p = default_params.typical_p; + if (!body["mirostat"].is_null()) { + llama.params.mirostat = body["mirostat"].get(); + } else { + llama.params.mirostat = default_params.mirostat; } - if (!body["repeat_last_n"].is_null()) - { - llama.params.repeat_last_n = body["repeat_last_n"].get(); + if (!body["mirostat_tau"].is_null()) { + llama.params.mirostat_tau = body["mirostat_tau"].get(); + } else { + llama.params.mirostat_tau = default_params.mirostat_tau; } - else - { - llama.params.repeat_last_n = default_params.repeat_last_n; + if (!body["mirostat_eta"].is_null()) { + llama.params.mirostat_eta = body["mirostat_eta"].get(); + } else { + llama.params.mirostat_eta = default_params.mirostat_eta; } - if (!body["temperature"].is_null()) - { - llama.params.temp = body["temperature"].get(); + if (!body["penalize_nl"].is_null()) { + llama.params.penalize_nl = body["penalize_nl"].get(); + } else { + llama.params.penalize_nl = false; } - else - { - llama.params.temp = default_params.temp; + if (!body["n_keep"].is_null()) { + llama.params.n_keep = body["n_keep"].get(); + } else { + llama.params.n_keep = default_params.n_keep; } - if (!body["repeat_penalty"].is_null()) - { - llama.params.repeat_penalty = body["repeat_penalty"].get(); - } - else - { - llama.params.repeat_penalty = default_params.repeat_penalty; - } - if (!body["presence_penalty"].is_null()) - { - llama.params.presence_penalty = body["presence_penalty"].get(); - } - else - { - llama.params.presence_penalty = default_params.presence_penalty; - } - if (!body["frequency_penalty"].is_null()) - { - llama.params.frequency_penalty = body["frequency_penalty"].get(); - } - else - { - llama.params.frequency_penalty = default_params.frequency_penalty; - } - if (!body["mirostat"].is_null()) - { - llama.params.mirostat = body["mirostat"].get(); - } - else - { - llama.params.mirostat = default_params.mirostat; - } - if (!body["mirostat_tau"].is_null()) - { - llama.params.mirostat_tau = body["mirostat_tau"].get(); - } - else - { - llama.params.mirostat_tau = default_params.mirostat_tau; - } - if (!body["mirostat_eta"].is_null()) - { - llama.params.mirostat_eta = body["mirostat_eta"].get(); - } - else - { - llama.params.mirostat_eta = default_params.mirostat_eta; - } - if (!body["penalize_nl"].is_null()) - { - llama.params.penalize_nl = body["penalize_nl"].get(); - } - else - { - llama.params.penalize_nl = default_params.penalize_nl; - } - if (!body["n_keep"].is_null()) - { - llama.params.n_keep = body["n_keep"].get(); - } - else - { - llama.params.n_keep = default_params.n_keep; - } - if (!body["seed"].is_null()) - { + if (!body["seed"].is_null()) { llama.params.seed = body["seed"].get(); - } - else - { + } else { llama.params.seed = time(NULL); } - if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) - { - llama.params.logit_bias[llama_token_eos()] = -INFINITY; + if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { + llama.params.logit_bias[llama_token_eos()] = -INFINITY; + } else { + llama.params.logit_bias.erase(llama_token_eos()); } - else - { - llama.params.logit_bias.erase(llama_token_eos()); - } - if (!body["prompt"].is_null()) - { + if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); - } - else - { - json data = { - 
{"status", "error"}, - {"reason", "You need to pass the prompt"}}; + } else { + json data = {{"status", "error"}, {"reason", "You need to pass the prompt"}}; res.set_content(data.dump(llama.json_indent), "application/json"); res.status = 400; return false; } - if (!body["stop"].is_null()) - { + if (!body["stop"].is_null()) { llama.params.antiprompt = body["stop"].get>(); - } - else - { - llama.params.antiprompt.clear(); + } else { + llama.params.antiprompt.clear(); } if (llama.verbose) { - std::string tmp_stop = - std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), - std::string{}, [](std::string a, std::string b) { - return a + (a != "" ? ", \"" : "\"") + b + "\""; - }); + std::string tmp_stop = + std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), + std::string{}, [](std::string a, std::string b) { + return a + (a != "" ? ", \"" : "\"") + b + "\""; + }); - fprintf(stderr, - "-------------------------\n" - "/completion parameters: {\n" - " stream: %d,\n" - " frequency_penalty: %f,\n" - " mirostat: %d,\n" - " mirostat_eta: %f,\n" - " mirostat_tau: %f,\n" - " n_keep: %d,\n" - " n_predict: %d,\n" - " penalize_nl: %d,\n" - " presence_penalty: %f,\n" - " repeat_last_n: %d,\n" - " repeat_penalty: %f,\n" - " seed: %d,\n" - " stop: [%s],\n" - " temperature: %f,\n" - " tfs_z: %f,\n" - " top_k: %d,\n" - " top_p: %f,\n" - " typical_p: %f,\n" - "}\nPROMPT[%s]\n", - llama.stream, llama.params.frequency_penalty, llama.params.mirostat, - llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, - llama.params.n_predict, llama.params.penalize_nl, - llama.params.presence_penalty, llama.params.repeat_last_n, - llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), - llama.params.temp, llama.params.tfs_z, llama.params.top_k, - llama.params.top_p, llama.params.typical_p, llama.params.prompt.c_str()); + fprintf(stderr, + "-------------------------\n" + "/completion parameters: {\n" + " stream: %d,\n" + " ignore_eos: %d,\n" + " frequency_penalty: %f,\n" + " mirostat: %d,\n" + " mirostat_eta: %f,\n" + " mirostat_tau: %f,\n" + " n_keep: %d,\n" + " n_predict: %d,\n" + " penalize_nl: %d,\n" + " presence_penalty: %f,\n" + " repeat_last_n: %d,\n" + " repeat_penalty: %f,\n" + " seed: %d,\n" + " stop: [%s],\n" + " temperature: %f,\n" + " tfs_z: %f,\n" + " top_k: %d,\n" + " top_p: %f,\n" + " typical_p: %f,\n" + "}\nPROMPT[%s]\n", + llama.stream, -INFINITY == llama.params.logit_bias[llama_token_eos()], + llama.params.frequency_penalty, llama.params.mirostat, + llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, + llama.params.n_predict, llama.params.penalize_nl, + llama.params.presence_penalty, llama.params.repeat_last_n, + llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), + llama.params.temp, llama.params.tfs_z, llama.params.top_k, llama.params.top_p, + llama.params.typical_p, llama.params.prompt.c_str()); } return true; From 3edaf6bd8bdc853f7f0a10f9e397bd01d0e99238 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 12:55:19 -0300 Subject: [PATCH 10/10] print timings by default --- examples/server/server.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eb75ab1de..d6fb84cd9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -796,9 +796,7 @@ int main(int argc, char **argv) {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}}; - if (llama.verbose) { - 
llama_print_timings(llama.ctx); - } + llama_print_timings(llama.ctx); return res.set_content( data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), @@ -870,16 +868,13 @@ int main(int argc, char **argv) if (!sink.write(str.data(), str.size())) { if (llama.verbose) { fprintf(stderr, "stream closed\n"); - llama_print_timings(llama.ctx); } + llama_print_timings(llama.ctx); return false; } } - if (llama.verbose) { - llama_print_timings(llama.ctx); - } - + llama_print_timings(llama.ctx); sink.done(); return true; };
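
For reference, below is a minimal standalone sketch (not part of any patch above; the
helper name utf8_pending_bytes and the sample bytes are illustrative) of the UTF-8
lead-byte masking that PATCH 07/10 relies on when it withholds an incomplete
multi-byte character from the streamed response:

#include <cstdio>

// Returns how many UTF-8 continuation bytes must still arrive after lead byte c.
// Same masks as the chunked content provider in PATCH 07/10; assumes c starts a
// new sequence (it is not itself a continuation byte of an earlier one).
static int utf8_pending_bytes(unsigned char c) {
    if ((c & 0xE0) == 0xC0) return 1; // 110xxxxx -> 2-byte sequence
    if ((c & 0xF0) == 0xE0) return 2; // 1110xxxx -> 3-byte sequence
    if ((c & 0xF8) == 0xF0) return 3; // 11110xxx -> 4-byte sequence
    return 0;                         // ASCII (0xxxxxxx) or a continuation byte
}

int main() {
    // 'a', lead byte of U+00E9, lead byte of U+20AC, lead byte of a 4-byte emoji
    const unsigned char leads[] = {0x61, 0xC3, 0xE2, 0xF0};
    for (unsigned char c : leads) {
        std::printf("0x%02X -> %d pending continuation byte(s)\n",
                    (unsigned) c, utf8_pending_bytes(c));
    }
    return 0;
}

When a generated token is a single byte and this classification reports a non-zero
count, the loop in PATCH 07/10 skips sending, forcing has_next_token back to true
if the model had just stopped, so clients never receive a split code point.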