diff --git a/Llamaserver.py b/Llamaserver.py new file mode 100644 index 000000000..2b47b86f1 --- /dev/null +++ b/Llamaserver.py @@ -0,0 +1,110 @@ +from queue import Queue +import threading +import requests +import json + +def print_dict(data): + for k, v in data.items(): + if isinstance(v, dict): + print_dict(v) + elif isinstance(v, list): + for entry in v: + print_dict(entry) + elif k == "content": + print(f"Key: {k:>30}: {v}") + return + +def print_response(text): + print(text) + +def make_empty_bar(num_requests): + bar = [] + for i in range(num_requests): + bar.append("\u2589") + bar = ' '.join(bar) + bar = bar.replace(' ','') + print(f"Bar is now {bar}.") + return bar + +def make_progress_bar(bar, count, num_requests): + stride1 = len("\u2589") + stride2 = len("\u23F1") + for i in range(num_requests): + if i == count: + print(f"Bar position {i} is {bar[i]}") + bar = bar[:i*stride1] + "\u23F1" + bar[i*stride1 + stride2:] + print(f"Bar is now {bar}") + return bar + +def send_request(q, question, event, count, num_requests): + + global bar + + data = {'prompt': question} + + try: + response = requests.post(url, headers=headers, json=data) + if response.status_code in [200,300]: + print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n") + print(f"Status Code for {question}: {response.status_code}") + print(f"Response to {question}:\n") + print_dict(json.loads(response.text)) + # put the response text in the queue + q.put(response.text) + if not q.empty(): + print(f"Completed task {count} / {num_requests}\n") + bar = make_progress_bar(bar, count, num_requests) + q.task_done() + elif response.status_code == 429 and not q.empty(): + event.set() + print("Server return too many requests; back off!! Reset event.") + except Exception as e: + print(f"Server returned exception error {e}") + +if __name__ == "__main__": + + global bar + + url = "http://localhost:8080/completion" + + num_requests = 40 + q = Queue(maxsize = 40) + threads = [] + + bar = make_empty_bar(num_requests) + + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + 'User-Agent': 'Llamaserver.py' + } + + country_list = ["France", "Germany", "China", "USA", "Italy", "India", + "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia", + "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada", + "Bulgaria", "Romania", "Finland", "Sweden", "Norway", "Denmark", "Tanzania", "Israel", + "Latvia", "Lithuania", "Estonia", "Pakistan", "Sri Lanka", "Malawi", "Mozambique"] + + for i in range(num_requests): + country = country_list[i % len(country_list)] + question = f"When was the first democratic election (if any) in {country}?" 
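The 429 branch in send_request() sets the event but nothing ever waits on it, so the back-off never actually happens. A minimal Python sketch of one way the workers could honour it (GO and guarded_post() are illustrative names, not part of the script; it assumes a llama.cpp server on http://localhost:8080):

import threading
import requests

GO = threading.Event()                  # cleared => all workers back off
GO.set()
URL = "http://localhost:8080/completion"

def guarded_post(prompt):
    while True:
        GO.wait()                       # blocks while we are backing off
        r = requests.post(URL, json={"prompt": prompt}, timeout=120)
        if r.status_code == 429:        # server overloaded: pause everyone for 2s
            GO.clear()
            threading.Timer(2.0, GO.set).start()
            continue
        r.raise_for_status()
        return r.text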
+ # NOTE: don't pass the parameter as a function call; pass in args + print(f"Processing request {i} / {num_requests}: {question}\n") + event = threading.Event() + t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests)) + t.start() + threads.append(t) + # input("Any key",) + + for thread in threads: + thread.join() # wait for all threads to finish + + print("FINISHED AND GETTING RESULTS") + while not q.empty(): + text = q.get() + print_dict(json.loads(text)) + + + + + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index fa127a3aa..13db64e9f 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -17,6 +17,7 @@ else() add_subdirectory(batched-bench) add_subdirectory(beam-search) add_subdirectory(benchmark) + add_subdirectory(cmap-example) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(finetune) diff --git a/examples/cmap-example/CMakeLists.txt b/examples/cmap-example/CMakeLists.txt new file mode 100644 index 000000000..6298b2c7e --- /dev/null +++ b/examples/cmap-example/CMakeLists.txt @@ -0,0 +1,9 @@ +set(TARGET kvcacheviz) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +add_executable(${TARGET} kvcacheviz.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +if (WIN32) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) +endif() +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/cmap-example/KVcacheViz.py b/examples/cmap-example/KVcacheViz.py new file mode 100644 index 000000000..dcfe0c0e5 --- /dev/null +++ b/examples/cmap-example/KVcacheViz.py @@ -0,0 +1,29 @@ +# A simple illustration of how to represent cache occupancy +# graphically using unicvode blocks +# which are generated using print("\u2588"), print("\u2591") + +from time import sleep +import random + +CACHE_SIZE = 50 +used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10] + +def visualize_kv_cache(used_blocks, total_size): + cache_viz = "[" + tot_used = 0 + for i in range(len(used_blocks)): + # cache_viz += "█" * used_blocks[i] + cache_viz += "\u2589" * used_blocks[i] + cache_viz += "░" * (total_size - used_blocks[i]) + cache_viz += f"{used_blocks[i]:3.0f}/{total_size}]\r[" + tot_used += used_blocks[i] + + #print(f"\r[{cache_viz}] {used_blocks[i]:2.0f}/{total_size}", end="") + + print(f"\r{cache_viz}] {tot_used}/{len(used_blocks) * total_size}", end="") + + +while True: + visualize_kv_cache(used_blocks, CACHE_SIZE) + sleep(0.5) + used_blocks = used_blocks[1:] + [random.randint(0,50)] # update used blocks diff --git a/examples/cmap-example/kvcacheviz.cpp b/examples/cmap-example/kvcacheviz.cpp new file mode 100644 index 000000000..607b7f323 --- /dev/null +++ b/examples/cmap-example/kvcacheviz.cpp @@ -0,0 +1,68 @@ +/* +A utility to represent the kv-cache occupancy graphically +Takes as parameters +- total cache size (-c) +- number of simultaneous accesses/slots (-np) +- a parameter related to the display context (max window width - data display requirements) +It then uses a trick borrowed from tqdm to display occupancy +TODO: Show contiguous space and block availability +*/ +#include +#include +#include +#include // for rand() + +static void show_kvcache( + std::vector used_blocks, + int cache_size, + int max_length +) { + int num_blocks = used_blocks.size(); + int slot_cache_size = cache_size / num_blocks; + + while(true) { + + // Print visualization after erasing the current line + for(int i=0; i 7 * max_length / slot_cache_size + 0.5)) { + // 
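KVcacheViz.py above redraws every slot's bar on the same line with \r, so in practice only the last slot's bar is visible each frame. A small sketch of a reusable per-slot bar (occupancy_bar() is an illustrative helper, not part of the example):

def occupancy_bar(used, capacity, width=40):
    filled = min(width, round(width * used / capacity))
    return "\u2588" * filled + "\u2591" * (width - filled) + f" {used:4d}/{capacity}"

if __name__ == "__main__":
    for used in (5, 17, 40, 50):
        print(occupancy_bar(used, 50))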
std::cout << "\033[D\033[D\033[D\033[D" << std::setw(3) << used_blocks[i] << "\033[C"; + //} + else { + std::cout << "\033[91m█\033[0m"; + } + } + std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl; + } + std::cout << "{"; + std::string upcursor = "\033[K\033[A\033[K"; + + for(int i=0; i < num_blocks; i++){ + //std::cout << used_blocks[i] << " "; + upcursor += "\033[A\033[K"; + } + + // Remove first element + used_blocks.erase(used_blocks.begin()); + + // Add new random block at the end + u_int new_block = rand() % slot_cache_size; + used_blocks.push_back(new_block); + +// Adjust the cursor so that the display overwrites itself + upcursor += "\033[A\033[K"; + std::cout << "}" << std::endl; + std::cin.get(); + std::cout << upcursor; + } +} + +int main() { + std::vector used_blocks = {64, 64, 64, 64, 64, 64, 64, 64, 64, 46, 46, 46, 46, 46, 46, 46, 46, 46}; + int cache_size = 65536; + int max_length = 128; + show_kvcache(used_blocks, cache_size, max_length); + } diff --git a/examples/main/Adjustmain.py b/examples/main/Adjustmain.py new file mode 100644 index 000000000..8f3473bb1 --- /dev/null +++ b/examples/main/Adjustmain.py @@ -0,0 +1,17 @@ +# Adjust the main.cpp file +# to include the (Tokens used:) data output + +try: + with open("/Users/edsilm2/llama.cpp/examples/main/main.cpp", 'r+') as file: + main = file.read() + search_str = 'printf("\\n> ");' + new_str = 'printf("\\033[31m(Tokens used: %d / %d)\\033[0m\\nJCP: ", n_past, n_ctx);' + main = main.replace(search_str, new_str) + file.seek(0) + search_str = 'context full and n_predict == -%d => stopping' + new_str = 'context full and n_predict == %d => stopping' + main = main.replace(search_str, new_str) + file.seek(0) + file.write(main) +except FileNotFoundError as fe: + print(f"Error searching for main.cpp: {fe}") diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c53b29978..45983593a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -522,7 +522,7 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict); break; } @@ -780,7 +780,7 @@ int main(int argc, char ** argv) { LOG("waiting for user input\n"); if (params.instruct || params.chatml) { - printf("\n> "); + printf("\033[31m(Tokens used: %d / %d)\033[0m\nJCP: ", n_past, n_ctx); } if (params.input_prefix_bos) { diff --git a/examples/server/httplib.h b/examples/server/httplib.h index 28746000c..4f08c3df9 100644 --- a/examples/server/httplib.h +++ b/examples/server/httplib.h @@ -19,7 +19,7 @@ #endif #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT -#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 +#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 // originally 5 #endif #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND @@ -110,7 +110,7 @@ #endif #ifndef CPPHTTPLIB_LISTEN_BACKLOG -#define CPPHTTPLIB_LISTEN_BACKLOG 5 +#define CPPHTTPLIB_LISTEN_BACKLOG 15 // originally 5 #endif /* diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1d30a15a6..aa9a3b991 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -28,6 +28,11 @@ #include #include +#include +#include +#include +#include // for rand() + #ifndef SERVER_VERBOSE #define 
SERVER_VERBOSE 1 #endif @@ -61,29 +66,28 @@ static bool server_verbose = false; } while (0) #endif -#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) json oaicompat_completion_params_parse(const json &body); std::string format_chatml(std::vector messages); - // // base64 utils (TODO: move to common in the future) // static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; static inline bool is_base64(uint8_t c) { return (isalnum(c) || (c == '+') || (c == '/')); } -static std::vector base64_decode(const std::string & encoded_string) +static std::vector base64_decode(const std::string &encoded_string) { int i = 0; int j = 0; @@ -98,17 +102,18 @@ static std::vector base64_decode(const std::string & encoded_string) while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; in_++; + char_array_4[i++] = encoded_string[in_]; + in_++; if (i == 4) { - for (i = 0; i <4; i++) + for (i = 0; i < 4; i++) { char_array_4[i] = base64_chars.find(char_array_4[i]); } - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[0] = ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; for (i = 0; (i < 3); i++) { @@ -120,19 +125,19 @@ static std::vector base64_decode(const std::string & encoded_string) if (i) { - for (j = i; j <4; j++) + for (j = i; j < 4; j++) { char_array_4[j] = 0; } - for (j = 0; j <4; j++) + for (j = 0; j < 4; j++) { char_array_4[j] = base64_chars.find(char_array_4[j]); } - char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[0] = ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; for (j = 0; (j < i - 1); j++) { @@ -147,18 +152,21 @@ static std::vector base64_decode(const std::string & encoded_string) // parallel // -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded - SERVER_STATE_ERROR // An error occurred, load_model failed +enum server_state +{ + SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet + SERVER_STATE_READY, // Server is ready and model is loaded + SERVER_STATE_ERROR // An error occurred, load_model failed }; -enum task_type { +enum task_type +{ TASK_TYPE_COMPLETION, TASK_TYPE_CANCEL, }; -struct task_server { +struct task_server +{ int id; int target_id; task_type type; @@ -168,7 +176,8 @@ struct task_server { int multitask_id = -1; }; -struct task_result { +struct task_result +{ int id; int multitask_id = -1; bool stop; 
@@ -176,13 +185,14 @@ struct task_result { json result_json; }; -struct task_multi { +struct task_multi +{ int id; std::set subtasks_remaining{}; std::vector results{}; }; -// TODO: can become bool if we can't find use of more states +// TODO: can become bool if we can't find use of more states (Could include PAUSED for cases where things jam) enum slot_state { IDLE, @@ -198,12 +208,12 @@ enum slot_command struct slot_params { - bool stream = true; + bool stream = true; bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict + uint32_t seed = -1; // RNG seed + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict std::vector antiprompt; @@ -216,10 +226,10 @@ struct slot_image int32_t id; bool request_encode_image = false; - float * image_embedding = nullptr; + float *image_embedding = nullptr; int32_t image_tokens = 0; - clip_image_u8 * img_data; + clip_image_u8 *img_data; std::string prefix_prompt; // before of this image }; @@ -295,13 +305,12 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) static void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { - nlohmann::ordered_json log - { + nlohmann::ordered_json log{ {"timestamp", time(nullptr)}, - {"level", level}, - {"function", function}, - {"line", line}, - {"message", message}, + {"level", level}, + {"function", function}, + {"line", line}, + {"message", message}, }; if (!extra.empty()) @@ -310,7 +319,7 @@ static void server_log(const char *level, const char *function, int line, } const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); + LOG("%.*s\n", (int)str.size(), str.data()); fflush(stdout); } @@ -340,16 +349,15 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector 0; // no budget } - - bool available() const { + // this means is idle doing nothing and so available + bool available() const + { return state == IDLE && command == NONE; } - - bool is_processing() const { + // this means the slot is either idle and loading or is processing + bool is_processing() const + { return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; } - void add_token_string(const completion_token_output &token) { + void add_token_string(const completion_token_output &token) + { if (command == RELEASE) { return; @@ -490,39 +504,104 @@ struct llama_client_slot generated_token_probs.push_back(token); } - void release() { - if (state == IDLE || state == PROCESSING) + void release() + { + if (state == IDLE || state == PROCESSING) // Isn't this redundant since slot_state has only two options? 
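The question in the release() comment above ("isn't this redundant?") is really about the state x command matrix: a slot is free only when it is IDLE and has no pending command. A small Python sketch of the two predicates (enum members taken from the surrounding code; everything else is illustrative):

from enum import Enum, auto

class State(Enum):
    IDLE = auto()
    PROCESSING = auto()

class Command(Enum):
    NONE = auto()
    LOAD_PROMPT = auto()
    RELEASE = auto()

def available(state, command):
    # free for a new task: idle with nothing pending
    return state is State.IDLE and command is Command.NONE

def is_processing(state, command):
    # busy: already decoding, or idle but about to load a prompt
    return (state is State.IDLE and command is Command.LOAD_PROMPT) or state is State.PROCESSING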
{ t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; command = RELEASE; } } - json get_formated_timings() { - return json - { - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + json get_formated_timings() + { + return json{ + {"prompt_n", num_prompt_tokens_processed}, + {"prompt_ms", t_prompt_processing}, + {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, + {"predicted_n", n_decoded}, + {"predicted_ms", t_token_generation}, {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, + {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, }; } - void print_timings() const { + void print_timings() const + { LOG_TEE("\n"); LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); + __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); + __func__, t_token_generation, n_decoded, t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); } }; +// my custom function to display graphics of the kvcache status +static void show_kvcache(std::vector> used_blocks, int cache_size) { + + int max_length = 128; + int num_blocks = used_blocks.size(); + int slot_cache_size = cache_size / num_blocks; + bool cls_flag = true; + std::string slot_symbol1 = ""; + std::string slot_symbol2 = ""; + std::string slot_symbol3 = ""; + auto& p = used_blocks[0]; + llama_client_slot slot = p.second; + + return; // remove when not in debug mode + + if ((used_blocks.size() == 0) || (used_blocks[0].first == 0)) { + return; + } + + // Print visualization + // Always start at the top left of the window (H means 'move cursor to this position'; 2J = cls) + // Only clear the screen the first time round + if (cls_flag) { + printf("\033[2J"); + cls_flag = false; + } + printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n"); + for(int i=0; i system_tokens; - std::string name_user; // this should be the antiprompt + std::string name_user; // this should be the antiprompt std::string name_assistant; // slots / clients @@ -556,13 +635,13 @@ struct llama_server_context std::vector queue_tasks; std::vector queue_results; - std::vector queue_multitasks; + std::vector queue_multitasks; std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks std::condition_variable condition_tasks; std::mutex mutex_results; std::condition_variable condition_results; - ~llama_server_context() + ~llama_server_context() // destructor called automatically whenever the llama_server_context 
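For reference, the arithmetic behind get_formated_timings() above, collected in one place (timings() is an illustrative name; times are in milliseconds as in the C++):

def timings(t_prompt_ms, n_prompt, t_gen_ms, n_gen):
    return {
        "prompt_per_token_ms":    t_prompt_ms / n_prompt,
        "prompt_per_second":      1e3 * n_prompt / t_prompt_ms,
        "predicted_per_token_ms": t_gen_ms / n_gen,
        "predicted_per_second":   1e3 * n_gen / t_gen_ms,
    }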
stops/fails { if (ctx) { @@ -579,16 +658,19 @@ struct llama_server_context bool load_model(const gpt_params ¶ms_) { params = params_; - if (!params.mmproj.empty()) { + if (!params.mmproj.empty()) + { multimodal = true; LOG_TEE("Multi Modal Mode Enabled"); - clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); - if(clp_ctx == nullptr) { + clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); + if (clp_ctx == nullptr) + { LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); return false; } - if (params.n_ctx < 2048) { // request larger context for the image embedding + if (params.n_ctx < 2048) + { // request larger context for the image embedding params.n_ctx = 2048; } } @@ -600,10 +682,12 @@ struct llama_server_context return false; } - if (multimodal) { + if (multimodal) + { const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); - const int n_embd_llm = llama_n_embd(model); - if (n_embd_clip != n_embd_llm) { + const int n_embd_llm = llama_n_embd(model); + if (n_embd_clip != n_embd_llm) + { LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm); llama_free(ctx); llama_free_model(model); @@ -618,35 +702,42 @@ struct llama_server_context return true; } - void initialize() { + void initialize() + { id_gen = 0; - // create slots + // create slots: assign each one its id and context size all_slots_are_idle = true; - const int32_t n_ctx_slot = n_ctx / params.n_parallel; + const int32_t n_ctx_slot = n_ctx / params.n_parallel; // can this be made a lazy assignment from number of clients? LOG_TEE("Available slots:\n"); - for (int i = 0; i < params.n_parallel; i++) + for (int i = 0; i < params.n_parallel; i++) // this sets up n_parallel slots and works correctly { - llama_client_slot slot; + llama_client_slot slot; // create a slot as instance of llama_client_slot - slot.id = i; - slot.n_ctx = n_ctx_slot; - slot.reset(); + slot.id = i; // assign a slot id number + slot.n_ctx = n_ctx_slot; // assign a slot cache (but see the lazy note above) + slot.reset(); // basically just zero everything so pristine state LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); - slots.push_back(slot); + slots.push_back(slot); // store the latest slot in slots } - - batch = llama_batch_init(n_ctx, 0, params.n_parallel); + // llama_batch is defined in llama.h where it says its blocks must have size n_tokens + // however this seems to work without memory problems setting blocks to n_ctx_slot + // in which case we seem to be able to assign enormous context sizes to the project + // but the comment in llama.h is not entirely clear what n_tokens needs to be + // only what the components of the batch need to be, ie the same size as n_tokens + // and llama_batch_init seems to ensure that this condition is always met + // by the way it is initialised, so I am a bit puzzled by what the requirement is + batch = llama_batch_init(n_ctx_slot, 0, params.n_parallel); // context_size; embd false; number of parallel slots // empty system prompt system_prompt = ""; system_tokens.clear(); } - std::vector tokenize(const json & json_prompt, bool add_bos) const + std::vector tokenize(const json &json_prompt, bool add_bos) const { // TODO: currently, we tokenize using special tokens by default // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) @@ -660,7 +751,7 @@ struct llama_server_context if 
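initialize() above divides the cache evenly between slots with integer division; whatever does not divide is simply unused. With the numbers used elsewhere in this patch (18 slots, a 65536-token cache):

def slot_context(n_ctx, n_parallel):
    return n_ctx // n_parallel          # integer division; the remainder is unused

assert slot_context(65536, 18) == 3640  # 18 slots of 3640 tokens, 16 tokens spare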
(json_prompt.is_array()) { bool first = true; - for (const auto& p : json_prompt) + for (const auto &p : json_prompt) { if (p.is_string()) { @@ -696,60 +787,71 @@ struct llama_server_context return prompt_tokens; } - llama_client_slot* get_slot(int id) { + // get a new slot by comparing timestamps - is this where it goes wrong? Called from 1655 + llama_client_slot *get_slot(int id) + { int64_t t_last = ggml_time_us(); - llama_client_slot *last_used = nullptr; + llama_client_slot *last_used = nullptr; // default return if no slot is available; should we look for first used available? - for (llama_client_slot & slot : slots) + for (llama_client_slot &slot : slots) { - if (slot.id == id && slot.available()) + if (slot.id == id && slot.available()) // this seems only to select slot 5 !!! { + LOG_TEE("Slot %2d chosen (untimed; marked as available)\n", slot.id); return &slot; } - if (slot.available() && slot.t_last_used < t_last) + if (slot.available() && slot.t_last_used < t_last) // seems always to select slot 0 and one other ... { + LOG_TEE("Slot %2d chosen (timed and finished)\n", slot.id); last_used = &slot; t_last = slot.t_last_used; + break; // we only want one and preferably the first one; this works to that effect } } - + if (last_used != nullptr) { + LOG_TEE("Slot %2d selected as available\n", last_used->id); + } return last_used; } - bool launch_slot_with_data(llama_client_slot* &slot, json data) { + bool launch_slot_with_data(llama_client_slot *&slot, json data) + { slot_params default_params; llama_sampling_params default_sparams; - if (data.count("__oaicompat") != 0) { + if (data.count("__oaicompat") != 0) + { slot->oaicompat = true; slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - } else { + } + else + { slot->oaicompat = false; slot->oaicompat_model = ""; } - slot->params.stream = json_value(data, "stream", false); - slot->params.cache_prompt = json_value(data, "cache_prompt", false); - slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); - slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); - slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); - slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); - slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); - slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); - slot->sparams.n_probs = json_value(data, 
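The added break in get_slot() above changes its behaviour: without it the loop keeps the least-recently-used free slot, with it the first free slot wins. A Python sketch of the two variants side by side (the dict fields are illustrative stand-ins for the slot struct):

def get_slot(slots, wanted_id, now, first_available=False):
    best, t_last = None, now
    for s in slots:
        if s["id"] == wanted_id and s["available"]:
            return s                          # explicitly requested and free
        if s["available"] and s["t_last_used"] < t_last:
            best, t_last = s, s["t_last_used"]
            if first_available:               # the patched behaviour: stop at the first hit
                break
    return best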
"n_probs", default_sparams.n_probs); + slot->params.stream = json_value(data, "stream", false); + slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); + slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); // infill if (data.count("input_prefix") != 0) @@ -888,7 +990,8 @@ struct llama_server_context std::string prompt = slot->prompt.get(); size_t pos = 0, begin_prefix = 0; std::string pattern = "[img-"; - while ((pos = prompt.find(pattern, pos)) != std::string::npos) { + while ((pos = prompt.find(pattern, pos)) != std::string::npos) + { size_t end_prefix = pos; pos += pattern.length(); size_t end_pos = prompt.find("]", pos); @@ -901,19 +1004,23 @@ struct llama_server_context bool found = false; for (slot_image &img : slot->images) { - if (img.id == img_id) { + if (img.id == img_id) + { found = true; img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix); begin_prefix = end_pos + 1; break; } } - if (!found) { + if (!found) + { LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); slot->images.clear(); return false; } - } catch (const std::invalid_argument& e) { + } + catch (const std::invalid_argument &e) + { LOG_TEE("Invalid image number id in prompt\n"); slot->images.clear(); return false; @@ -937,27 +1044,29 @@ struct llama_server_context all_slots_are_idle = false; - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + LOG_TEE("Slot %2i is processing [task id: %2i]\n", slot->id, slot->task_id); return true; } - void kv_cache_clear() { + void kv_cache_clear() + { // clear the entire KV cache llama_kv_cache_clear(ctx); clean_kv_cache = false; } - void update_system_prompt() { + void update_system_prompt() + { system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token); llama_batch_clear(batch); kv_cache_clear(); - for (int i = 0; i < (int) system_tokens.size(); ++i) + for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, { 0 }, false); + 
llama_batch_add(batch, system_tokens[i], i, {0}, false); } if (llama_decode(ctx, batch) != 0) @@ -972,11 +1081,12 @@ struct llama_server_context llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); } - LOG_TEE("system prompt updated\n"); + LOG_TEE("system prompt updated when system_tokens_size was %zu\n", system_tokens.size()); system_need_update = false; } - void notify_system_prompt_changed() { + void notify_system_prompt_changed() + { // release all slots for (llama_client_slot &slot : slots) { @@ -986,9 +1096,10 @@ struct llama_server_context system_need_update = true; } - void process_system_prompt_data(const json &sys_props) { - system_prompt = sys_props.value("prompt", ""); - name_user = sys_props.value("anti_prompt", ""); + void process_system_prompt_data(const json &sys_props) + { + system_prompt = sys_props.value("prompt", ""); + name_user = sys_props.value("anti_prompt", ""); name_assistant = sys_props.value("assistant_name", ""); if (slots.size() > 0) @@ -1031,7 +1142,8 @@ struct llama_server_context return stop_pos; } - bool process_token(completion_token_output &result, llama_client_slot &slot) { + bool process_token(completion_token_output &result, llama_client_slot &slot) + { // remember which tokens were sampled - used for repetition penalties during sampling const std::string token_str = llama_token_to_piece(ctx, result.tok); slot.sampled = result.tok; @@ -1152,8 +1264,8 @@ struct llama_server_context { continue; } - clip_image_f32 * img_res = clip_image_f32_init(); - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true)) + clip_image_f32 *img_res = clip_image_f32_init(); + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/true)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); @@ -1180,7 +1292,7 @@ struct llama_server_context return slot.images.size() > 0; } - void send_error(task_server& task, std::string error) + void send_error(task_server &task, std::string error) { std::unique_lock lock(mutex_results); task_result res; @@ -1188,12 +1300,12 @@ struct llama_server_context res.multitask_id = task.multitask_id; res.stop = false; res.error = true; - res.result_json = { { "content", error } }; + res.result_json = {{"content", error}}; queue_results.push_back(res); condition_results.notify_all(); } - void add_multi_task(int id, std::vector& sub_ids) + void add_multi_task(int id, std::vector &sub_ids) { std::lock_guard lock(mutex_tasks); task_multi multi; @@ -1203,10 +1315,10 @@ struct llama_server_context condition_tasks.notify_one(); } - void update_multi_task(int multitask_id, int subtask_id, task_result& result) + void update_multi_task(int multitask_id, int subtask_id, task_result &result) { std::lock_guard lock(mutex_tasks); - for (auto& multitask : queue_multitasks) + for (auto &multitask : queue_multitasks) { if (multitask.id == multitask_id) { @@ -1227,34 +1339,34 @@ struct llama_server_context const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); - return json { - {"n_ctx", slot.n_ctx}, - {"model", params.model_alias}, - {"seed", slot.params.seed}, - {"temperature", slot.sparams.temp}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", 
slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, + return json{ + {"n_ctx", slot.n_ctx}, + {"model", params.model_alias}, + {"seed", slot.params.seed}, + {"temperature", slot.sparams.temp}, + {"top_k", slot.sparams.top_k}, + {"top_p", slot.sparams.top_p}, + {"min_p", slot.sparams.min_p}, + {"tfs_z", slot.sparams.tfs_z}, + {"typical_p", slot.sparams.typical_p}, + {"repeat_last_n", slot.sparams.penalty_last_n}, + {"repeat_penalty", slot.sparams.penalty_repeat}, + {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, - {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"grammar", slot.sparams.grammar}, + {"mirostat", slot.sparams.mirostat}, + {"mirostat_tau", slot.sparams.mirostat_tau}, + {"mirostat_eta", slot.sparams.mirostat_eta}, + {"penalize_nl", slot.sparams.penalize_nl}, + {"stop", slot.params.antiprompt}, + {"n_predict", slot.params.n_predict}, + {"n_keep", params.n_keep}, + {"ignore_eos", ignore_eos}, + {"stream", slot.params.stream}, + {"logit_bias", slot.sparams.logit_bias}, + {"n_probs", slot.sparams.n_probs}, + {"grammar", slot.sparams.grammar}, }; } @@ -1267,19 +1379,17 @@ struct llama_server_context res.error = false; res.stop = false; - res.result_json = json - { - {"content", tkn.text_to_send}, - {"stop", false}, - {"slot_id", slot.id}, - {"multimodal", multimodal} - }; + res.result_json = json{ + {"content", tkn.text_to_send}, + {"stop", false}, + {"slot_id", slot.id}, + {"multimodal", multimodal}}; if (slot.sparams.n_probs > 0) { std::vector probs_output = {}; const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); + size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); if (probs_pos < probs_stop_pos) { @@ -1308,24 +1418,22 @@ struct llama_server_context res.error = false; res.stop = true; - res.result_json = json - { - {"content", !slot.params.stream ? slot.generated_text : ""}, - {"slot_id", slot.id}, - {"stop", true}, - {"model", params.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, + res.result_json = json{ + {"content", !slot.params.stream ? 
slot.generated_text : ""}, + {"slot_id", slot.id}, + {"stop", true}, + {"model", params.model_alias}, + {"tokens_predicted", slot.n_decoded}, + {"tokens_evaluated", slot.num_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()} - }; + {"prompt", slot.prompt}, + {"truncated", slot.truncated}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, + {"tokens_cached", slot.n_past}, + {"timings", slot.get_formated_timings()}}; if (slot.sparams.n_probs > 0) { @@ -1338,8 +1446,8 @@ struct llama_server_context else { probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end()); + slot.generated_token_probs.begin(), + slot.generated_token_probs.end()); } res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs); } @@ -1375,8 +1483,7 @@ struct llama_server_context LOG_WARNING("embedding disabled", { {"params.embedding", params.embedding}, }); - res.result_json = json - { + res.result_json = json{ {"embedding", std::vector(n_embd, 0.0f)}, }; } @@ -1384,9 +1491,8 @@ struct llama_server_context { const float *data = llama_get_embeddings(ctx); std::vector embedding(data, data + n_embd); - res.result_json = json - { - {"embedding", embedding }, + res.result_json = json{ + {"embedding", embedding}, }; } queue_results.push_back(res); @@ -1406,14 +1512,18 @@ struct llama_server_context task.multitask_id = multitask_id; // when a completion task's prompt array is not a singleton, we split it into multiple requests + // but this only applies to a single completion with multiple prompts, not to a string of separate prompts if (task.data.count("prompt") && task.data.at("prompt").size() > 1) { + LOG_TEE("Queue size: %3zu; prompt size count is %3zu\n", queue_tasks.size(), task.data.at("prompt").size()); lock.unlock(); // entering new func scope return split_multiprompt_task(task); } - // otherwise, it's a single-prompt task, we actually queue it + // otherwise, it's a single-prompt task, so queue it + LOG_TEE("Queue size: %2zu before adding task %2d\n", queue_tasks.size(), task.id); queue_tasks.push_back(task); + LOG_TEE("Queue size: %2zu after adding task %2d\n", queue_tasks.size(), task.id); condition_tasks.notify_one(); return task.id; } @@ -1422,12 +1532,12 @@ struct llama_server_context { while (true) { + // LOG_TEE("Queue results size: %zu / %zu\n", queue_results.size(), queue_results.max_size()); std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - return !queue_results.empty(); - }); + condition_results.wait(lock, [&] + { return !queue_results.empty(); }); - for (int i = 0; i < (int) queue_results.size(); i++) + for (int i = 0; i < (int)queue_results.size(); i++) { // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result if (queue_results[i].multitask_id == task_id) @@ -1448,7 +1558,7 @@ struct llama_server_context } // never reached - //return task_result{-1, false, false, {}}; + // return task_result{-1, false, false, {}}; } // for multiple images processing @@ -1456,22 +1566,22 @@ struct llama_server_context { int image_idx = 0; - while (image_idx < (int) 
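request_completion() above hands any request whose "prompt" is an array to split_multiprompt_task(); a Python sketch of that split, with one subtask per prompt sharing a multitask id (names are illustrative, queue_tasks stands in for the real task queue):

import itertools

_next_id = itertools.count()

def request_completion(data, queue_tasks):
    prompts = data.get("prompt")
    if isinstance(prompts, list) and len(prompts) > 1:
        multitask_id = next(_next_id)
        for p in prompts:                      # one subtask per prompt
            queue_tasks.append({"id": next(_next_id),
                                "multitask_id": multitask_id,
                                "data": {**data, "prompt": p}})
        return multitask_id
    task_id = next(_next_id)
    queue_tasks.append({"id": task_id, "multitask_id": -1, "data": data})
    return task_id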
slot.images.size()) + while (image_idx < (int)slot.images.size()) { slot_image &img = slot.images[image_idx]; // process prefix prompt - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); llama_batch batch_view = { n_tokens, - batch.token + i, + batch.token + i, nullptr, - batch.pos + i, + batch.pos + i, batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, + batch.seq_id + i, + batch.logits + i, 0, 0, 0, // unused }; if (llama_decode(ctx, batch_view)) @@ -1491,7 +1601,18 @@ struct llama_server_context } const int n_embd = llama_n_embd(model); - llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, }; + llama_batch batch_img = { + n_eval, + nullptr, + (img.image_embedding + i * n_embd), + nullptr, + nullptr, + nullptr, + nullptr, + slot.n_past, + 1, + 0, + }; if (llama_decode(ctx, batch_img)) { LOG_TEE("%s : failed to eval image\n", __func__); @@ -1504,14 +1625,13 @@ struct llama_server_context llama_batch_clear(batch); // append prefix of next image - const auto json_prompt = (image_idx >= (int) slot.images.size()) ? - slot.params.input_suffix : // no more images, then process suffix prompt - (json)(slot.images[image_idx].prefix_prompt); + const auto json_prompt = (image_idx >= (int)slot.images.size()) ? slot.params.input_suffix : // no more images, then process suffix prompt + (json)(slot.images[image_idx].prefix_prompt); std::vector append_tokens = tokenize(json_prompt, false); // has next image - for (int i = 0; i < (int) append_tokens.size(); ++i) + for (int i = 0; i < (int)append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true); + llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); slot.n_past += 1; } } @@ -1530,7 +1650,7 @@ struct llama_server_context condition_tasks.notify_one(); } - int split_multiprompt_task(task_server& multiprompt_task) + int split_multiprompt_task(task_server &multiprompt_task) { int prompt_count = multiprompt_task.data.at("prompt").size(); assert(prompt_count > 1); @@ -1556,49 +1676,57 @@ struct llama_server_context std::unique_lock lock(mutex_tasks); while (!queue_tasks.empty()) { + // Assign the task at the front of the queue to task and remove it from the queue task_server task = queue_tasks.front(); queue_tasks.erase(queue_tasks.begin()); + LOG_TEE("Queue size: %zu members\n", queue_tasks.size()); + // what kind of task have we loaded: completion or cancel? 
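The pointer arithmetic in the batch_view above is just windowing: walk the big batch in n_batch-sized slices. The same loop in Python (batch_views() is an illustrative name):

def batch_views(tokens, n_batch):
    for i in range(0, len(tokens), n_batch):
        yield tokens[i:i + n_batch]

assert list(batch_views(list(range(10)), 4)) == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]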
switch (task.type) { - case TASK_TYPE_COMPLETION: { - llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); - if (slot == nullptr) - { - LOG_TEE("slot unavailable\n"); - // send error result - send_error(task, "slot unavailable"); - return; - } + case TASK_TYPE_COMPLETION: + { + llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + // get_slot returns nullptr if no slot is available; otherwise a pointer to the chosen slot itself + if (slot == nullptr) + { + LOG_TEE("no slots are available\n"); + // send error result + send_error(task, "slot unavailable"); + return; + } - if (task.data.contains("system_prompt")) - { - process_system_prompt_data(task.data["system_prompt"]); - } + if (task.data.contains("system_prompt")) + { + process_system_prompt_data(task.data["system_prompt"]); + } - slot->reset(); + slot->reset(); // reset the slot to a known fixed state - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; - slot->multitask_id = task.multitask_id; + slot->infill = task.infill_mode; // assign the infill parameter + slot->embedding = task.embedding_mode; // assign the embedding parameter + slot->task_id = task.id; // assign the task id + slot->multitask_id = task.multitask_id; // assign any multitask parameters such as images - if (!launch_slot_with_data(slot, task.data)) + if (!launch_slot_with_data(slot, task.data)) + { + // send error result + send_error(task, "internal_error"); // if we can't launch the slot with the task data, log an error + break; + } + } + break; + case TASK_TYPE_CANCEL: + { // release slot linked with the task id + for (auto &slot : slots) + { + if (slot.task_id == task.target_id) { - // send error result - send_error(task, "internal_error"); - break; + slot.release(); // this sets command to RELEASE but doesn't actually do the release + break; // it also collects the time taken to generate the tokens } - } break; - case TASK_TYPE_CANCEL: { // release slot linked with the task id - for (auto & slot : slots) - { - if (slot.task_id == task.target_id) - { - slot.release(); - break; - } - } - } break; + } + } + break; } } @@ -1616,12 +1744,12 @@ struct llama_server_context // collect json results into one json result std::vector result_jsons; - for (auto& subres : queue_iterator->results) + for (auto &subres : queue_iterator->results) { result_jsons.push_back(subres.result_json); aggregate_result.error = aggregate_result.error && subres.error; } - aggregate_result.result_json = json{ "results", result_jsons }; + aggregate_result.result_json = json{"results", result_jsons}; std::lock_guard lock(mutex_results); queue_results.push_back(aggregate_result); @@ -1636,11 +1764,19 @@ struct llama_server_context } } - bool update_slots() { + bool update_slots() + { + // Set up a store for the lengths of all the kvcaches with their slot.id values + std::vector> kvcache_store; + for (auto &slot : slots) { + kvcache_store.push_back({0,slot.id}); + } + // attend tasks + process_tasks(); - // update the system prompt wait until all slots are idle state + // update the system prompt wait until all slots are idle state WHAT DOES THIS ACHIEVE? 
if (system_need_update && all_slots_are_idle) { LOG_TEE("updating system prompt\n"); @@ -1653,25 +1789,26 @@ struct llama_server_context { if (system_prompt.empty() && clean_kv_cache) { - LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); + LOG_TEE("all slots are idle and system prompt is empty; clearing the KV cache\n"); kv_cache_clear(); } + LOG_TEE("Queue has %zu / %zu members\n", queue_tasks.size(), queue_tasks.max_size()); std::unique_lock lock(mutex_tasks); - condition_tasks.wait(lock, [&]{ - return !queue_tasks.empty(); - }); + condition_tasks.wait(lock, [&] + { return !queue_tasks.empty(); }); } + // this is where we shift the context once the kvcache is full for (llama_client_slot &slot : slots) { - if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) + if (slot.is_processing() && (slot.cache_tokens.size() >= (size_t)slot.n_ctx)) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; + // Shift context (-1 because the default for n_keep is -1) + const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_discard = rand() % (n_left/2); // discard up to half the processed tokens and (below) shift to start of cache LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); + llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) @@ -1679,6 +1816,14 @@ struct llama_server_context slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; } + // TODO: experimental graphic display - needs to be made a function instead of repeatedly being called + // TODO: since we are now passing the whole 'slot' we don't need a vector-pair but leave it for now + std::vector> kvcache_store; + for (auto &slot1 : slots) { + kvcache_store.push_back({slot1.cache_tokens.size(),slot1}); + } + show_kvcache(kvcache_store, params.n_ctx); + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); slot.n_past -= n_discard; @@ -1686,25 +1831,39 @@ struct llama_server_context slot.truncated = true; LOG_VERBOSE("context shift", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); + {"n_ctx", n_ctx}, + {"n_keep", params.n_keep}, + {"n_left", n_left}, + }); } } // decode any currently ongoing sequences - for (auto & slot : slots) + // FIRST deal with the cases where the slot command state is RELEASE or IDLE; otherwise it is PROCESSING + // SECOND check whether the slot has no PROMPT or is INFILL (which needn't have a prompt) + // OTHERWISE process the slots that have state IDLE and have a PROMPT + for (auto &slot : slots) { - // release the slot + //show_kvcache(std::vector>{ + // {int(slot.cache_tokens.size()), slot}}, slot.n_ctx); + + // release the slot (remember that slot if an instance of "struct llama_client_slot") + // it's not clear what releasing does if it doesn't occasion reuse of the slot ... 
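The context shift above is easier to see on a plain list: keep the first n_keep+1 tokens, drop the next n_discard, and slide the remainder down. A sketch (shift_context() is illustrative; the real code also shifts the KV-cache sequence positions):

def shift_context(cache, n_keep, n_discard):
    keep = cache[:n_keep + 1]
    rest = cache[n_keep + 1 + n_discard:]
    return keep + rest                     # n_discard tokens gone from the middle

cache = list(range(12))
assert shift_context(cache, n_keep=1, n_discard=4) == [0, 1, 6, 7, 8, 9, 10, 11]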
+ // the logic here isn't clear because we are singling out RELEASE and IDLE of 6 combinations if (slot.command == RELEASE) { slot.state = IDLE; slot.command = NONE; - slot.t_last_used = ggml_time_us(); + slot.t_last_used = ggml_time_us(); // used to reassign slots once finished but does it? - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + LOG_TEE("slot %d released (%d tokens remain in cache)\n", slot.id, (int)slot.cache_tokens.size()); + // we need to make the next five lines into a called function eventually + std::vector> kvcache_store; + for (auto &slot1 : slots) { + kvcache_store.push_back({slot1.cache_tokens.size(),slot1}); + } + show_kvcache(kvcache_store, params.n_ctx); continue; } @@ -1713,9 +1872,10 @@ struct llama_server_context continue; } + // if we get here slot.command = NONE or LOAD_PROMPT; slot.state = PROCESSING slot.i_batch = batch.n_tokens; - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true); + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, {slot.id}, true); slot.n_past += 1; } @@ -1726,7 +1886,7 @@ struct llama_server_context // assign workload to the slots if (params.cont_batching || batch.n_tokens == 0) { - for (auto & slot : slots) + for (auto &slot : slots) // better not to reuse the same variable name 'slot' here { const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); @@ -1740,7 +1900,7 @@ struct llama_server_context continue; } - // need process the prompt + // if we get here, we need to process the prompt if (slot.state == IDLE && slot.command == LOAD_PROMPT) { slot.state = PROCESSING; @@ -1749,6 +1909,7 @@ struct llama_server_context slot.t_start_process_prompt = ggml_time_us(); slot.t_start_genereration = 0; + // deal with infill if (slot.infill) { bool suff_rm_leading_spc = true; @@ -1760,8 +1921,12 @@ struct llama_server_context auto prefix_tokens = tokenize(slot.params.input_prefix, false); auto suffix_tokens = tokenize(slot.params.input_suffix, false); + // what does the next line do? why is it here and why not hardcoded? 
+ // it appears that this is a hard-coded TOKEN for the space in infill suffix + // basically we're just removing spaces from suffices const int space_token = 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { + if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) + { suffix_tokens.erase(suffix_tokens.begin()); } @@ -1774,7 +1939,7 @@ struct llama_server_context } else { - prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt + prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt } slot.num_prompt_tokens = prompt_tokens.size(); @@ -1796,11 +1961,11 @@ struct llama_server_context new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); LOG_VERBOSE("input truncated", { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, - }); + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, + }); slot.truncated = true; prompt_tokens = new_tokens; @@ -1826,10 +1991,10 @@ struct llama_server_context slot.n_past = common_part(slot.cache_tokens, prompt_tokens); slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG_TEE("slot %d : in cache: %i tokens | processed: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); + LOG_TEE("Slot %2d : kv cache rm - [%d, end)\n", slot.id, (int)system_tokens.size() + slot.n_past); llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1843,18 +2008,18 @@ struct llama_server_context } LOG_VERBOSE("prompt ingested", { - {"n_past", slot.n_past}, - {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, - {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, - }); + {"n_past", slot.n_past}, + {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, + {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, + }); const bool has_images = process_images(slot); // process the prefix of first image std::vector prefix_tokens = has_images ? 
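The truncation above keeps the first n_keep prompt tokens and drops half-context blocks from the middle until the rest fits. A simplified Python sketch of that idea (truncate_prompt() is illustrative and not the exact block-count formula used in the C++):

def truncate_prompt(tokens, n_ctx, n_keep):
    if len(tokens) < n_ctx:
        return tokens
    n_block = (n_ctx - n_keep) // 2                 # half of the usable context
    erased_blocks = 0
    while len(tokens) - erased_blocks * n_block >= n_ctx:
        erased_blocks += 1                          # drop another block from the middle
    return tokens[:n_keep] + tokens[n_keep + erased_blocks * n_block:]

assert truncate_prompt(list(range(10)), 8, 2) == [0, 1, 5, 6, 7, 8, 9]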
tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; - for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) + for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { - llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false); + llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, {slot.id}, false); } if (has_images && !ingest_images(slot, n_batch)) @@ -1870,7 +2035,16 @@ struct llama_server_context } slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; + slot.i_batch = batch.n_tokens - 1; + + // This draws the graphic for all the slots at once; comment out to silence until control is done + + std::vector> kvcache_store; + for (auto &slot1 : slots) { + kvcache_store.push_back({slot1.cache_tokens.size(),slot1}); + } + show_kvcache(kvcache_store, params.n_ctx); + } } } @@ -1881,20 +2055,20 @@ struct llama_server_context return true; } - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) + for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); llama_batch batch_view = - { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, 0, 0, // unused - }; + { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused because of the three entries at the end of llama_batch + }; const int ret = llama_decode(ctx, batch_view); if (ret != 0) @@ -1906,7 +2080,14 @@ struct llama_server_context return false; } - LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); + LOG_TEE("%s : no contiguous space size %d in the KV cache; trying with n_batch = %d\n", __func__, n_batch, n_batch / 2); + // Experimental display of KV cache usage + + std::vector> kvcache_store; + for (auto &slot1 : slots) { + kvcache_store.push_back({slot1.cache_tokens.size(),slot1}); + } + show_kvcache(kvcache_store, params.n_ctx); // retry with half the batch size to try to find a free slot in the KV cache n_batch /= 2; @@ -1914,9 +2095,9 @@ struct llama_server_context continue; } - for (auto & slot : slots) + for (auto &slot : slots) { - if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) + if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { continue; } @@ -1942,7 +2123,7 @@ struct llama_server_context slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; } - llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; + llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false}; result.tok = id; const int32_t n_probs = slot.sparams.n_probs; @@ -2027,10 +2208,11 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
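The decode loop above reacts to a full KV cache by halving n_batch and retrying the same window; the shape of that loop in Python (decode_with_backoff() and the decode callback are illustrative):

def decode_with_backoff(decode, tokens, n_batch):
    i = 0
    while i < len(tokens):
        if decode(tokens[i:i + n_batch]):
            i += n_batch                 # this window went through, move on
        elif n_batch > 1:
            n_batch //= 2                # no contiguous space: retry a smaller window
        else:
            return False                 # even a single token will not fit
    return True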
"enabled" : "disabled"); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); + printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --log-disable disables logging to a file.\n"); + printf(" --log-file FNAME Specify a log file name to override the default llama.log"); printf("\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); @@ -2039,7 +2221,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, } static void server_params_parse(int argc, char **argv, server_params &sparams, - gpt_params ¶ms, llama_server_context& llama) + gpt_params ¶ms, llama_server_context &llama) { gpt_params default_params; server_params default_sparams; @@ -2093,16 +2275,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::ifstream key_file(argv[i]); - if (!key_file) { + if (!key_file) + { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); invalid_param = true; break; } std::string key; - while (std::getline(key_file, key)) { - if (key.size() > 0) { - sparams.api_keys.push_back(key); - } + while (std::getline(key_file, key)) + { + if (key.size() > 0) + { + sparams.api_keys.push_back(key); + } } key_file.close(); } @@ -2156,10 +2341,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } - else { invalid_param = true; break; } + /**/ if (value == "none") + { + params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; + } + else if (value == "linear") + { + params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; + } + else if (value == "yarn") + { + params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; + } + else + { + invalid_param = true; + break; + } } else if (arg == "--rope-freq-base") { @@ -2181,7 +2379,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "--yarn-ext-factor") { - if (++i >= argc) { + if (++i >= argc) + { invalid_param = true; break; } @@ -2189,7 +2388,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "--yarn-attn-factor") { - if (++i >= argc) { + if (++i >= argc) + { invalid_param = true; break; } @@ -2197,7 +2397,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "--yarn-beta-fast") { - if (++i >= argc) { + if (++i >= argc) + { invalid_param = true; break; } @@ -2205,7 +2406,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "--yarn-beta-slow") { - if (++i >= argc) { + if (++i >= argc) + { invalid_param = true; break; } @@ -2323,7 +2525,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - const char * lora_adapter = argv[i]; + const char 
*lora_adapter = argv[i]; if (++i >= argc) { invalid_param = true; break; } @@ -2377,7 +2579,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--n-predict") + } + else if (arg == "-n" || arg == "--n-predict") { if (++i >= argc) { @@ -2385,7 +2588,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } params.n_predict = std::stoi(argv[i]); - } else if (arg == "-spf" || arg == "--system-prompt-file") + } + else if (arg == "-spf" || arg == "--system-prompt-file") { if (++i >= argc) { @@ -2393,7 +2597,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::ifstream file(argv[i]); - if (!file) { + if (!file) + { fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); invalid_param = true; break; @@ -2402,11 +2607,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::copy( std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), - std::back_inserter(systm_content) - ); + std::back_inserter(systm_content)); llama.process_system_prompt_data(json::parse(systm_content)); } - else if(arg == "--mmproj") + else if (arg == "--mmproj") { if (++i >= argc) { @@ -2420,14 +2624,26 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, log_set_target(stdout); LOG_INFO("logging to file is disabled.", {}); } + // allow the user to redirect logging to a custom file + else if (arg == "--log-file") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + log_set_target(argv[i]); + LOG_TEE("logging to custom filename %s.\n", argv[i]); + } + else if (arg == "--override-kv") { - if (++i >= argc) { + if (++i >= argc) + { invalid_param = true; break; } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { + char *sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] >= 128) + { fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); invalid_param = true; break; @@ -2436,27 +2652,39 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::strncpy(kvo.key, argv[i], sep - argv[i]); kvo.key[sep - argv[i]] = 0; sep++; - if (strncmp(sep, "int:", 4) == 0) { + if (strncmp(sep, "int:", 4) == 0) + { sep += 4; kvo.tag = LLAMA_KV_OVERRIDE_INT; kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { + } + else if (strncmp(sep, "float:", 6) == 0) + { sep += 6; kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { + } + else if (strncmp(sep, "bool:", 5) == 0) + { sep += 5; kvo.tag = LLAMA_KV_OVERRIDE_BOOL; - if (std::strcmp(sep, "true") == 0) { + if (std::strcmp(sep, "true") == 0) + { kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { + } + else if (std::strcmp(sep, "false") == 0) + { kvo.bool_value = false; - } else { + } + else + { fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); invalid_param = true; break; } - } else { + } + else + { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; break; @@ -2470,7 +2698,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, exit(1); } } - if (!params.kv_overrides.empty()) { + if (!params.kv_overrides.empty()) + { params.kv_overrides.emplace_back(llama_model_kv_override()); params.kv_overrides.back().key[0] = 0; } @@ -2492,7 +2721,8 @@ static std::string random_string() std::string result(32,
' '); - for (int i = 0; i < 32; ++i) { + for (int i = 0; i < 32; ++i) + { result[i] = str[generator() % str.size()]; } @@ -2510,9 +2740,10 @@ std::string format_chatml(std::vector messages) { std::ostringstream chatml_msgs; - for (auto it = messages.begin(); it != messages.end(); ++it) { + for (auto it = messages.begin(); it != messages.end(); ++it) + { chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; + << json_value(*it, "role", std::string("user")) << '\n'; chatml_msgs << json_value(*it, "content", std::string("")) << "<|im_end|>\n"; } @@ -2538,35 +2769,39 @@ json oaicompat_completion_params_parse( // // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' - llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); - llama_params["temperature"] = json_value(body, "temperature", 0.0); - llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); - llama_params["top_p"] = json_value(body, "top_p", 1.0); - llama_params["n_predict"] = json_value(body, "max_tokens", -1); - llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); + llama_params["model"] = json_value(body, "model", std::string("unknown")); + llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); + llama_params["temperature"] = json_value(body, "temperature", 0.0); + llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); + llama_params["top_p"] = json_value(body, "top_p", 1.0); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); + llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); - llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); - llama_params["stream"] = json_value(body, "stream", false); - llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); - llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); - llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); - llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); - llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); + llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); + llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); + llama_params["stream"] = json_value(body, "stream", false); + llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); + llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); + 
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); + llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); + llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); - if (body.count("grammar") != 0) { + if (body.count("grammar") != 0) + { llama_params["grammar"] = json_value(body, "grammar", json::object()); } // Handle 'stop' field - if (body.contains("stop") && body["stop"].is_string()) { + if (body.contains("stop") && body["stop"].is_string()) + { llama_params["stop"] = json::array({body["stop"].get()}); - } else { + } + else + { llama_params["stop"] = json_value(body, "stop", json::array()); } @@ -2580,45 +2815,48 @@ static json format_final_response_oaicompat(const json &request, const task_resu { json result = response.result_json; - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { + if (stopped_word || stopped_eos) + { finish_reason = "stop"; } json choices = streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) + {"index", 0}, + {"delta", json::object()}}}) : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); std::time_t t = std::time(0); json res = json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; - if (server_verbose) { + if (server_verbose) + { res["__verbose"] = result; } - if (result.contains("completion_probabilities")) { + if (result.contains("completion_probabilities")) + { res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); } @@ -2626,26 +2864,30 @@ static json format_final_response_oaicompat(const json &request, const task_resu } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(const task_result &response) { +static std::vector format_partial_response_oaicompat(const task_result &response) +{ json result = response.result_json; - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) + { return std::vector({response.result_json}); } bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); std::string content = json_value(result, "content", std::string("")); std::string finish_reason; - if (stopped_word || stopped_eos) { + if (stopped_word || stopped_eos) + { finish_reason = "stop"; } - if (stopped_limit) { + if (stopped_limit) + { finish_reason = "length"; } @@ -2653,46 +2895,54 @@ static std::vector format_partial_response_oaicompat(const task_result &re json choices; - if (!finish_reason.empty()) { + if (!finish_reason.empty()) + { choices = json::array({json{{"finish_reason", finish_reason}, {"index", 0}, {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { + } + else + { + if (first) + { + if (content.empty()) + { choices = json::array({json{{"finish_reason", nullptr}, {"index", 0}, {"delta", json{{"role", "assistant"}}}}}); - } else { + } + else + { // We have to send this as two updates to conform to openai behavior json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"}}}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; return 
std::vector<json>({initial_ret, second_ret}); } - } else { + } + else + { // Some idiosyncrasy in task processing logic makes several trailing calls // with empty content, we ignore these at the callee site. - if (content.empty()) { + if (content.empty()) + { return std::vector<json>({json::object()}); } @@ -2700,9 +2950,9 @@ static std::vector format_partial_response_oaicompat(const task_result &re {"finish_reason", nullptr}, {"index", 0}, {"delta", - json{ - {"content", content}, - }}, + json{ + {"content", content}, + }}, }}); } } @@ -2717,15 +2967,13 @@ static std::vector format_partial_response_oaicompat(const task_result &re } static json format_partial_response( - llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs -) { - json res = json - { - {"content", content }, - {"stop", false}, - {"slot_id", slot->id }, - {"multimodal", llama.multimodal } - }; + llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs) +{ + json res = json{ + {"content", content}, + {"stop", false}, + {"slot_id", slot->id}, + {"multimodal", llama.multimodal}}; if (slot->sparams.n_probs > 0) { @@ -2747,7 +2995,6 @@ static json format_detokenized_response(std::string content) {"content", content}}; } - static void log_server_request(const httplib::Request &req, const httplib::Response &res) { LOG_INFO("request", { @@ -2767,33 +3014,43 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo struct token_translator { - llama_context * ctx; - std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } + llama_context *ctx; + std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } }; static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) { - auto & gtps = slot->generated_token_probs; + auto &gtps = slot->generated_token_probs; auto translator = token_translator{llama.ctx}; - auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); }; + auto add_strlen = [=](size_t sum, const completion_token_output &cto) + { return sum + translator(cto).size(); }; const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); if (slot->generated_text.capacity() < slot->generated_text.size() + len) { slot->generated_text.reserve(slot->generated_text.size() + len); } - for (const completion_token_output & cto : gtps) + for (const completion_token_output &cto : gtps) { slot->generated_text += translator(cto); } } +thread_local std::thread::id this_thread_id; + +static void thread_function() { + + this_thread_id = std::this_thread::get_id(); + // std::thread::id is an opaque type, so hash it into something printf can format + printf("Thread ID: %zu\n", std::hash<std::thread::id>{}(this_thread_id)); +} + +// the main server sequence begins here int main(int argc, char **argv) { #if SERVER_VERBOSE != 1 log_disable(); #endif - // own arguments required by this example + // own arguments required by this example; this is also where any new parameters get added gpt_params params; server_params sparams; @@ -2819,21 +3076,23 @@ int main(int argc, char **argv) {"system_info", llama_print_system_info()}, }); - httplib::Server svr; + httplib::Server svr; // Server is a class defined inside httplib.h std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL}; - svr.set_default_headers({{"Server",
"PJllama.cpp"}}); - // CORS preflight - svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) { + // CORS preflight (Cross-Origin Resource Sharing) + svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) + { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Credentials", "true"); res.set_header("Access-Control-Allow-Methods", "POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - }); + res.set_header("Access-Control-Allow-Headers", "*"); }); - svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { + // call this using http://localhost:8080/health or whatever + svr.Get("/health", [&](const httplib::Request &, httplib::Response &res) + { server_state current_state = state.load(); switch(current_state) { case SERVER_STATE_READY: @@ -2848,13 +3107,13 @@ int main(int argc, char **argv) res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json"); res.status = 500; // HTTP Internal Server Error break; - } - }); + } }); svr.set_logger(log_server_request); + // deal with various kinds of server response that indicate failure svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) - { + { const char fmt[] = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; try @@ -2870,11 +3129,10 @@ int main(int argc, char **argv) snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); } res.set_content(buf, "text/plain; charset=utf-8"); - res.status = 500; - }); + res.status = 500; }); svr.set_error_handler([](const httplib::Request &, httplib::Response &res) - { + { if (res.status == 401) { res.set_content("Unauthorized", "text/plain; charset=utf-8"); @@ -2887,11 +3145,10 @@ int main(int argc, char **argv) { res.set_content("File Not Found", "text/plain; charset=utf-8"); res.status = 404; - } - }); + } }); // set timeouts and change hostname and port - svr.set_read_timeout (sparams.read_timeout); + svr.set_read_timeout(sparams.read_timeout); svr.set_write_timeout(sparams.write_timeout); if (!svr.bind_to_port(sparams.hostname, sparams.port)) @@ -2910,49 +3167,57 @@ int main(int argc, char **argv) log_data["hostname"] = sparams.hostname; log_data["port"] = std::to_string(sparams.port); - if (sparams.api_keys.size() == 1) { + if (sparams.api_keys.size() == 1) + { log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4); - } else if (sparams.api_keys.size() > 1) { + } + else if (sparams.api_keys.size() > 1) + { log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; } LOG_INFO("HTTP server listening", log_data); // run the HTTP server in a thread - see comment below std::thread t([&]() - { + { if (!svr.listen_after_bind()) { state.store(SERVER_STATE_ERROR); return 1; } - return 0; - }); + return 0; }); // load the model if (!llama.load_model(params)) { state.store(SERVER_STATE_ERROR); return 1; - } else { + } + else + { llama.initialize(); state.store(SERVER_STATE_READY); LOG_INFO("model loaded", {}); } // Middleware for API key validation - auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { + auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool + { // If API key is not set, skip validation - if (sparams.api_keys.empty()) { + if (sparams.api_keys.empty()) + { return true; } // Check for API key in the header auto auth_header = 
req.get_header_value("Authorization"); std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { + if (auth_header.substr(0, prefix.size()) == prefix) + { std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) { + if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) + { return true; // API key is valid } } @@ -2970,48 +3235,46 @@ int main(int argc, char **argv) svr.Get("/", [](const httplib::Request &, httplib::Response &res) { res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html; charset=utf-8"); - return false; - }); + return false; }); // this is only called if no index.js is found in the public --path svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res) { res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript; charset=utf-8"); - return false; - }); + return false; }); // this is only called if no index.html is found in the public --path svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res) { res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript; charset=utf-8"); - return false; - }); + return false; }); // this is only called if no index.html is found in the public --path svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res) { res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8"); - return false; - }); + return false; }); - svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res) + svr.Get("/props", [&llama](const httplib::Request &req, httplib::Response &res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "user_name", llama.name_user.c_str() }, { "assistant_name", llama.name_assistant.c_str() } }; - res.set_content(data.dump(), "application/json; charset=utf-8"); - }); + res.set_content(data.dump(), "application/json; charset=utf-8"); }); + // this is the standard programmatic completion request svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) - { + { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } - json data = json::parse(req.body); + json data = json::parse(req.body); // accesses json.hpp to parse the request body const int task_id = llama.request_completion(data, false, false, -1); + LOG_TEE("Task ID: %d;\nRequest body: %s\n", task_id, req.body.c_str()); + thread_function(); if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); @@ -3071,10 +3334,9 @@ int main(int argc, char **argv) }; res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } - }); + } }); - svr.Get("/v1/models", [¶ms](const httplib::Request& req, httplib::Response& res) + svr.Get("/v1/models", [¶ms](const httplib::Request &req, httplib::Response &res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); std::time_t t = std::time(0); @@ -3091,18 +3353,16 @@ int main(int argc, char **argv) }} }; - res.set_content(models.dump(), "application/json; charset=utf-8"); - }); - + res.set_content(models.dump(), "application/json; 
charset=utf-8"); }); // TODO: add mount point without "/v1" prefix -- how? svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) - { + { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } - json data = oaicompat_completion_params_parse(json::parse(req.body)); + json data = oaicompat_completion_params_parse(json::parse(req.body)); // accesses json.hpp to parse the request body const int task_id = llama.request_completion(data, false, false, -1); @@ -3167,16 +3427,15 @@ int main(int argc, char **argv) }; res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } - }); + } }); svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) - { + { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } - json data = json::parse(req.body); + json data = json::parse(req.body); // accesses json.hpp to parse the request body const int task_id = llama.request_completion(data, true, false, -1); if (!json_value(data, "stream", false)) { std::string completion_text; @@ -3231,20 +3490,18 @@ int main(int argc, char **argv) }; res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } - }); + } }); svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res) { const json data = llama.get_model_props(); - return res.set_content(data.dump(), "application/json; charset=utf-8"); - }); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) { return res.set_content("", "application/json; charset=utf-8"); }); svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) - { + { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::vector tokens; @@ -3253,11 +3510,10 @@ int main(int argc, char **argv) tokens = llama.tokenize(body["content"], false); } const json data = format_tokenizer_response(tokens); - return res.set_content(data.dump(), "application/json; charset=utf-8"); - }); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) - { + { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::string content; @@ -3268,11 +3524,10 @@ int main(int argc, char **argv) } const json data = format_detokenized_response(content); - return res.set_content(data.dump(), "application/json; charset=utf-8"); - }); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) - { + { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); json prompt; @@ -3296,23 +3551,35 @@ int main(int argc, char **argv) const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1); task_result result = llama.next_result(task_id); - return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); - }); + return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); 
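Before the update loop below is moved onto its own thread, it is worth spelling out the batch-halving retry that the earlier update_slots hunks log ("no contiguous space ... trying with n_batch = ..."). The following is a minimal, self-contained sketch of that control flow only; Batch, decode_chunk() and kv_free_contiguous are hypothetical stand-ins for llama_batch, llama_decode() and the real KV-cache bookkeeping, not the llama.cpp API.

// Sketch of the retry-with-half-the-batch pattern used in update_slots:
// when a chunk does not fit in the (simulated) KV cache, halve n_batch and
// redo the same window. The i -= n_batch / continue pair relies on the for
// loop's own i += n_batch with the already-halved value, so i is unchanged.
#include <algorithm>
#include <cstdio>

struct Batch { int n_tokens; };              // stand-in for llama_batch

static int kv_free_contiguous = 96;          // pretend largest free run in the cache

static int decode_chunk(int n_tokens) {      // stand-in for llama_decode()
    return n_tokens <= kv_free_contiguous ? 0 : 1;   // 1 = no contiguous space found
}

int main() {
    Batch batch{300};
    int n_batch = 256;

    for (int i = 0; i < batch.n_tokens; i += n_batch) {
        const int n_tokens = std::min(n_batch, batch.n_tokens - i);
        if (decode_chunk(n_tokens) != 0) {
            if (n_batch == 1) {
                std::fprintf(stderr, "decode failed even with n_batch = 1\n");
                return 1;
            }
            std::fprintf(stderr, "no contiguous space of size %d; retrying with n_batch = %d\n",
                         n_tokens, n_batch / 2);
            n_batch /= 2;
            i -= n_batch;                    // together with the loop increment, retries the same offset
            continue;
        }
        std::printf("decoded tokens [%d, %d)\n", i, i + n_tokens);
    }
    return 0;
}

Halving rather than failing outright lets smaller contiguous runs in the cache still be used, which is exactly the situation the show_kvcache display added earlier is meant to make visible.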
- // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!? + // GG: if I put the main loop inside a thread, it crashes on the first request when built in Debug!? // "Bus error: 10" - this is on macOS, it does not crash on Linux - //std::thread t2([&]() + std::thread t2( // originally commented out, as was line 3542 [&]() { bool running = true; - while (running) + while (running) // update_slots returns true when successful { running = llama.update_slots(); } } - //); + ); // we get a SIGABRT on an exception here, as GG notes above, in Debug but not in Release - t.join(); + t2.join(); // originally t.join(); we now join the update-slots thread t2 started at line 3533 above llama_backend_free(); return 0; } + +/* +This illustrates what the threading lambda above does, in particular the capture by reference it uses when the thread is set up. +If we don't use threading, we force the whole thing to run sequentially. + +int x = 5; +auto lambda = [&] { + x += 10; // Modifies the original x + return x; +}; +int result = lambda(); // result will be 15 (x updated to 15) +*/
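The reference-capture illustration above can be taken one step further to mirror what t2 actually does: a worker loop captured with [&], started on a std::thread, and joined before shutdown. This is only a sketch under the assumption of a toy Server type; Server and its update_slots() stand in for llama_server_context and are not the real API.

// Sketch: the t2 pattern in isolation. The lambda captures srv by reference,
// so the worker thread drives the same object that main() inspects afterwards,
// just as the x += 10 example above modifies the original x.
#include <cstdio>
#include <thread>

struct Server {                      // toy stand-in for llama_server_context
    int remaining = 3;
    bool update_slots() {            // pretend work; returns false once nothing is left
        std::printf("update_slots: %d task(s) left\n", remaining);
        return --remaining > 0;
    }
};

int main() {
    Server srv;

    std::thread t2([&]() {           // capture by reference, as in the patch
        bool running = true;
        while (running) {            // keep going while update_slots reports work
            running = srv.update_slots();
        }
    });

    t2.join();                       // wait for the worker loop to finish
    std::printf("worker finished; remaining = %d\n", srv.remaining);
    return 0;
}

Compile with -pthread (or the platform equivalent); without the join, main could return and destroy srv while the worker thread is still using it.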