From 74d13ef3358e6e2407650e3890ff592fc49c3f87 Mon Sep 17 00:00:00 2001
From: pudepiedj
Date: Mon, 26 Feb 2024 12:09:06 +0000
Subject: [PATCH] Server updates

---
 Llamaserver.py                 |   5 +-
 apikeys.txt                    |   3 +
 cmakescript.sh                 |   2 +
 examples/server/CMakeLists.txt |   1 +
 examples/server/server.cpp     | 109 +++++++++++++++++++--------------
 examples/server/utils.hpp      |  24 +++-----
 6 files changed, 81 insertions(+), 63 deletions(-)
 create mode 100644 apikeys.txt

diff --git a/Llamaserver.py b/Llamaserver.py
index 3e02d7721..0342a22d0 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -104,10 +104,13 @@ if __name__ == "__main__":

     bar = make_empty_bar(num_requests)

+    api_key = input("What is your API key? ")
+
     headers = {
         'Content-Type': 'application/json',
         'Accept': 'application/json',
-        'User-Agent': 'Llamaserver.py'
+        'User-Agent': 'Llamaserver.py',
+        'Authorization': f'Bearer {api_key}'
     }

     country_list = ["France", "Germany", "China", "USA", "Italy", "India",
diff --git a/apikeys.txt b/apikeys.txt
new file mode 100644
index 000000000..441abe9f3
--- /dev/null
+++ b/apikeys.txt
@@ -0,0 +1,3 @@
+john123456
+susan987654
+guestabcdef
diff --git a/cmakescript.sh b/cmakescript.sh
index 6d77f9446..70e4eece4 100755
--- a/cmakescript.sh
+++ b/cmakescript.sh
@@ -11,3 +11,5 @@ else
     echo "Usage: $0 (Debug|Release)"
     exit 1
 fi
+
+cd build
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index cc13b2d63..965a3cb68 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
 add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index e935b8447..fc4f36e93 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -37,7 +37,7 @@ using json = nlohmann::json;

 struct server_params
 {
-    std::string hostname = "0.0.0.0"; // 127.0.0.1 restricts to localhost only; use 0.0.0.0 for local network.
+    std::string hostname = "127.0.0.1"; // localhost by default; the --host switch selects 0.0.0.0 to serve the public network.
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
     std::string chat_template = "";
@@ -307,12 +307,12 @@ struct llama_client_slot
        if (flag) {
            printf("\033[5;0H"); // needs to be sensitive to the number of slots
        };
-       LOG_TEE("Finished processing slot %d.\n", slot.id);
-       LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+       LOG("Finished processing slot %d.\n", slot.id);
+       LOG("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
-       LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+       LOG("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, t_token_generation, n_decoded, t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
-       LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+       LOG("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
        if (flag) {
            printf("\033[KPress any key ... \n");
@@ -347,7 +347,7 @@ static void kvgraphics(std::vector<llama_client_slot>& slots) {
    // See below for a rethink, because controlling log printing is such a pain in C++11
    // Only clear the screen the first time round
    if (cls_flag) {
-       // printf("\033[2J");
+       printf("\033[2J");
        cls_flag = false;
    }
    printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");
@@ -355,7 +355,7 @@ static void kvgraphics(std::vector<llama_client_slot>& slots) {
    // we can know and control how many lines of output we are printing so just start below that and fix the graphics location
    printf("\033[%d;0H", 10);
    for(int i=0; i<int(slots.size()); i++) {
@@ ... @@
        GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-       LOG_TEE(" -> Slot %2i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+       LOG(" -> Slot %2i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
    }

    slot.ga_i = 0;
@@ -588,13 +588,13 @@
            printf("\033[5;0H");
            if (slot.id == -1 && slot.available())
            {
-               LOG_TEE("Unallocated task now using slot %d", slot.id);
+               LOG("Unallocated task now using slot %d", slot.id);
                return &slot;
            }

            if (slot.id == id && slot.available())
            {
-               LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
+               LOG("Using id-based available slot called by id: %d\n", slot.id);
                return &slot;
            }
@@ -816,10 +816,10 @@
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                       LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+                       LOG("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
                        return false;
                    }
-                   LOG_TEE("slot %i - loaded image\n", slot->id);
+                   LOG("slot %i - loaded image\n", slot->id);
                    img_sl.request_encode_image = true;
                    slot->images.push_back(img_sl);
                }
@@ -851,12 +851,12 @@
                        }
                    }
                    if (!found) {
-                       LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                       LOG("ERROR: Image with id: %i, not found.\n", img_id);
                        slot->images.clear();
                        return false;
                    }
                } catch (const std::invalid_argument& e) {
-                   LOG_TEE("Invalid image number id in prompt\n");
+                   LOG("Invalid image number id in prompt\n");
                    slot->images.clear();
                    return false;
                }
@@ -879,7 +879,7 @@ struct llama_server_context
        all_slots_are_idle = false;

-       LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+       LOG("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);

        return true;
    }
@@ -906,7 +906,7 @@ struct llama_server_context

            if (llama_decode(ctx, batch) != 0)
            {
-               LOG_TEE("%s: llama_decode() failed\n", __func__);
+               LOG("%s: llama_decode() failed\n", __func__);
                return;
            }
@@ -917,7 +917,7 @@ struct llama_server_context
            }
        }

-       LOG_TEE("system prompt updated\n");
+       LOG("system prompt updated\n");
        system_need_update = false;
    }
@@ -1097,7 +1097,7 @@ struct llama_server_context
            }

            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-               LOG_TEE("Error processing the given image");
+               LOG("Error processing the given image");
                return false;
            }
@@ -1110,7 +1110,7 @@ struct llama_server_context

    void send_error(task_server& task, const std::string &error)
    {
-       LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+       LOG("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1353,7 +1353,7 @@ struct llama_server_context
                };

                if (llama_decode(ctx, batch_view))
                {
-                   LOG_TEE("%s : failed to eval\n", __func__);
+                   LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1371,7 +1371,7 @@ struct llama_server_context
                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
-                   LOG_TEE("%s : failed to eval image\n", __func__);
+                   LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1444,19 +1444,19 @@ struct llama_server_context
                // because if it doesn't, the returned value will be -1; what makes it anything else?
                int requested_slot = json_value(task.data, "slot_id", -1);
                printf("\033[5;0H\033[K");
-               LOG_TEE("Task %d requesting slot %d\n", task.id, requested_slot);
+               LOG("Task %d requesting slot %d\n", task.id, requested_slot);
                // why are we suddenly using 'slot' as a pointer here - confusing?
                llama_client_slot *slot = get_slot(requested_slot); // returns nullptr if no slot available
                if (slot == nullptr)
                {
                    // if no slot is available, we defer this task for processing later
-                   LOG_TEE("no slot is available for task %d\n", task.id);
+                   LOG("no slot is available for task %d\n", task.id);
                    queue_tasks.defer(task);
                    break;
                }
                else {
                    printf("\033[5;0H\033[K");
-                   LOG_TEE("Activating slot %d.\n", (*slot).id);
+                   LOG("Activating slot %d.\n", (*slot).id);
                }

                if (task.data.contains("system_prompt"))
@@ -1528,7 +1528,7 @@ struct llama_server_context
    bool update_slots() {
        if (system_need_update)
        {
-           LOG_TEE("updating system prompt\n");
+           LOG("updating system prompt\n");
            update_system_prompt();
        }
@@ -1538,7 +1538,7 @@ struct llama_server_context
        {
            if (system_prompt.empty() && clean_kv_cache)
            {
-               LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+               LOG("all slots are idle and system prompt is empty; clearing the KV cache\n");
                kv_cache_clear();
            }
            return true;
@@ -1562,7 +1562,7 @@ struct llama_server_context
                    const int n_discard = n_left / 2; // is this arbitrary?
printf("\033[5;0H\033[K"); - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); + LOG("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1597,7 +1597,7 @@ struct llama_server_context slot.t_last_used = ggml_time_us(); printf("\033[6;0H\033[K"); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + LOG("slot %d released (%d tokens remain in cache)\n", slot.id, (int) slot.cache_tokens.size()); queue_tasks.notify_slot_changed(); // why don't we immediately reallocate the released slot without waiting? Is this what -cb does? continue; @@ -1746,7 +1746,7 @@ struct llama_server_context } printf("\033[7;0H\033[K"); - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } slot.cache_tokens = prompt_tokens; @@ -1754,7 +1754,7 @@ struct llama_server_context if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); + LOG("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); slot.n_past--; if (slot.ga_i > 0) { @@ -1763,7 +1763,7 @@ struct llama_server_context } printf("\033[5;0H\033[K"); - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); + LOG("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1800,7 +1800,7 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); + LOG("failed processing images\n"); return false; } @@ -1837,10 +1837,10 @@ struct llama_server_context const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + LOG("\n"); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, 
                    llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
@@ -1850,7 +1850,7 @@ struct llama_server_context

                    slot.ga_i += slot.ga_w / slot.ga_n;

-                   LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                   LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                }
                slot.n_past_se += n_tokens;
            }
@@ -1877,12 +1877,12 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                   LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                   LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }

                // we get here if ret = 1 and n_batch != 1
-               LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+               LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -2823,11 +2823,14 @@ int main(int argc, char **argv)
    log_data["hostname"] = sparams.hostname;
    log_data["port"]     = std::to_string(sparams.port);

-   if (sparams.api_keys.size() == 1) {
+   if (sparams.api_keys.size() == 1) { // what happens if the size is zero?
        log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
    } else if (sparams.api_keys.size() > 1) {
        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
    }
+   for (int i = 0; i < int(sparams.api_keys.size()); i++) {
@@ ... @@
    auto middleware_validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
-       // If API key is not set, skip validation
+       // If API key is not set, because the file is empty, skip validation
        if (sparams.api_keys.empty()) {
            return true;
        }
@@ -2866,9 +2869,21 @@
        std::string prefix = "Bearer ";
        if (auth_header.substr(0, prefix.size()) == prefix) {
            std::string received_api_key = auth_header.substr(prefix.size());
-           if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
-               return true; // API key is valid
-           }
+           LOG("Received API key = %s\n", received_api_key.c_str());
+           for (int i = 0; i < int(sparams.api_keys.size()); i++) {
+               // keys read from the key file are one character longer than the Bearer value (most likely a trailing newline), so compare a trimmed copy
+               std::string uncut_api = sparams.api_keys[i]; // store original apikey
+               std::string cut_api = uncut_api.substr(0, uncut_api.size() - 1); // do not shorten in-place by using erase
+               if (received_api_key != cut_api) {
+                   LOG("%s != %s and length left = %zu, length right = %zu\n", received_api_key.c_str(), cut_api.c_str(), received_api_key.size(), cut_api.size());
+               } else if (received_api_key == cut_api) {
+                   LOG("%s = %s FOUND IT!!!\n", received_api_key.c_str(), cut_api.c_str());
+                   return true;
+               }
+           }
+           //if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
+           //    return true; // API key is valid
+           //}
        }

        // API key is invalid or not provided
@@ -2928,14 +2943,14 @@
            }
            // it appears that here we first get ONE request to parse; then TEN; then ONE-by-ONE
            printf("\033[5;0H\033[K");
-           LOG_TEE("Request body to parse: %s", req.body.c_str());
+           LOG_TEE("Request body to parse: %s.\n", req.body.c_str());
            if (llama.skvinteract) {
                getchar();
            }
            json data = json::parse(req.body);
            const int task_id = llama.queue_tasks.get_new_id(); // just returns a new id number
            llama.queue_results.add_waiting_task_id(task_id);
-           LOG_TEE("Initiated new task %d.\n", task_id);
+           LOG("Initiated new task %d.\n", task_id);
            llama.request_completion(task_id, data, false, false, -1);
            if (!json_value(data, "stream", false)) {
                std::string completion_text; // is this ever used?
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 1aa45f28c..84822c1fd 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -151,7 +151,7 @@ static inline void server_log(const char *level, const char *function, int line,
    }

    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-   printf("%.*s\n", (int)str.size(), str.data());
+   LOG("%.*s\n", (int)str.size(), str.data());
    fflush(stdout);
}
@@ -236,7 +236,7 @@ struct llama_server_queue {
            task.id = id; // originally id++ but this repeats get_new_id below
        }
        queue_tasks.push_back(std::move(task));
-       //LOG_TEE("Queue now has %2zu members.\n", queue_tasks.size());
+       //LOG("Queue now has %2zu members.\n", queue_tasks.size());
        condition_tasks.notify_one();
        return task.id;
    }
@@ -245,15 +245,13 @@
    void defer(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
-       printf("\033[1;50H*** ");
-       LOG_TEE("Deferred queue now has %3zu members.\n", queue_tasks_deferred.size());
-       printf("\033[5;0H");
+       LOG("Deferred queue now has %3zu members.\n", queue_tasks_deferred.size());
    }

    // Get the next id for creating a new task
    int get_new_id() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
-       LOG_TEE("New task id returned with value %d.\n", id);
+       LOG("New task id returned with value %d.\n", id);
        return id++;
    }
@@ -297,10 +295,10 @@ struct llama_server_queue {
    // Start the main loop. Called from the very end of server.cpp
    void start_loop() {
        running = true;
-       //LOG_TEE("In start_loop have new task number %d.\n", id);
+       //LOG("In start_loop have new task number %d.\n", id);
        while (true) {
            // new task arrived
-           // LOG_TEE("In start_loop have new task number %d.\n", id);
+           // LOG("In start_loop have new task number %d.\n", id);
            {
                while (true)
                {
@@ -393,17 +391,13 @@ struct llama_server_response {
    void add_waiting_task_id(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
-       printf("\033[1;50H*** ");
-       LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size());
-       printf("\033[5;0H");
+       LOG("Waiting task list size after addition: %2zu.\n", waiting_task_ids.size());
    }

    void remove_waiting_task_id(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
-       printf("\033[2;50H*** ");
-       LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
-       printf("\033[5;0H");
+       LOG("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
    }

    // This function blocks the thread until there is a response for this task_id
@@ -441,7 +435,7 @@ struct llama_server_response {
        std::unique_lock<std::mutex> lock(mutex_results);
        LOG_VERBOSE("send new result", {});
        for (auto& task_id : waiting_task_ids) {
-           // LOG_TEE("waiting task id %i \n", task_id);
+           // LOG("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
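
Usage note: the sketch below shows how a client would authenticate against the patched server, mirroring the Authorization header added to Llamaserver.py above. It is illustrative only: the /completion endpoint, the 8080 port and the payload fields are assumptions, not taken from this patch, and the strip() reflects the trailing-newline mismatch that the cut_api workaround in server.cpp compensates for.

    # client_sketch.py - illustrative only; endpoint, port and payload are assumed, not part of this patch
    import json
    import urllib.request

    # read one key from apikeys.txt; strip() removes the trailing newline that would
    # otherwise make the stored key one character longer than the Bearer value
    with open("apikeys.txt") as f:
        api_key = f.readline().strip()

    req = urllib.request.Request(
        "http://127.0.0.1:8080/completion",      # assumed endpoint and default port
        data=json.dumps({"prompt": "Tell me about France.", "n_predict": 64}).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "User-Agent": "Llamaserver.py",
            "Authorization": f"Bearer {api_key}",  # same header the patched Llamaserver.py sends
        },
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["content"])

If the key does not match one loaded by the server, the middleware_validate_api_key lambda rejects the request, so a quick way to test the patch is to run this once with a key from apikeys.txt and once with a deliberately wrong key.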