From 7f0d8987ebeea309714155a04a80c4d36c51466f Mon Sep 17 00:00:00 2001
From: pudepiedj
Date: Mon, 19 Feb 2024 12:14:23 +0000
Subject: [PATCH] minor updates and TCPshellscript

---
 Llamaserver.py                           |  9 +++--
 examples/CMakeLists.txt                  |  1 +
 examples/cmap-example/CMakeLists.txt     |  4 +--
 examples/cmap-example/TCPshellscript.cpp | 46 ++++++++++++++++++++++++
 examples/server/server.cpp               | 17 ++++++---
 examples/server/utils.hpp                | 16 +++++---
 6 files changed, 77 insertions(+), 16 deletions(-)
 create mode 100644 examples/cmap-example/TCPshellscript.cpp

diff --git a/Llamaserver.py b/Llamaserver.py
index 2dba932f4..dd5dba287 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -9,11 +9,14 @@ def print_dict(data):
     for k, v in data.items():
         if isinstance(v, dict):
             print_dict(v)
-        elif isinstance(v, list):
+        elif k == "content":
+            print(f"Key: {k:>30}: {v}")
+            return
+        elif isinstance(data, list):
             for entry in v:
                 print_dict(entry)
-        elif k == "content":
-            print(f"Key: {k:>30}: {v}")
+        elif isinstance(data, str):
+            print(f"Incoming string is {data}.\n")
     return
 
 def print_response(text):
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 653abc73a..7fb2e1e92 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -17,6 +17,7 @@ else()
     add_subdirectory(batched-bench)
     add_subdirectory(beam-search)
     add_subdirectory(benchmark)
+    add_subdirectory(cmap-example)
    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(finetune)
diff --git a/examples/cmap-example/CMakeLists.txt b/examples/cmap-example/CMakeLists.txt
index 6298b2c7e..d62ca26cf 100644
--- a/examples/cmap-example/CMakeLists.txt
+++ b/examples/cmap-example/CMakeLists.txt
@@ -1,6 +1,6 @@
-set(TARGET kvcacheviz)
+set(TARGET TCPshellscript)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} kvcacheviz.cpp)
+add_executable(${TARGET} TCPshellscript.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
diff --git a/examples/cmap-example/TCPshellscript.cpp b/examples/cmap-example/TCPshellscript.cpp
new file mode 100644
index 000000000..12b46ec9b
--- /dev/null
+++ b/examples/cmap-example/TCPshellscript.cpp
@@ -0,0 +1,46 @@
+// Code to run the terminal shell command `lsof -i :8080` from C++
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+using namespace std;
+
+static string get_port_usage(int port) {
+    // Build the command string
+    string command = "lsof -i :" + to_string(port);
+
+    // Create a pipe for capturing output
+    FILE *pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        cerr << "Error opening pipe" << endl;
+        return "";
+    }
+
+    // Read the output from the pipe
+    string output;
+    char buffer[128];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+        output += buffer;
+    }
+
+    // Close the pipe
+    pclose(pipe);
+
+    return output;
+}
+
+int main() {
+    int port = 8080;
+    string output = get_port_usage(port);
+
+    if (output.empty()) {
+        cerr << "Error getting port " << port << " usage" << endl;
+    } else {
+        cout << "Port " << port << " usage:" << endl << output << endl;
+    }
+
+    return 0;
+}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ec586e2ed..bb0c99587 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -508,7 +508,7 @@ struct llama_server_context
         default_generation_settings_for_props = get_formatted_generation(slots.front());
         default_generation_settings_for_props["seed"] = -1;
 
-        batch = llama_batch_init(n_ctx_slot, 0, params.n_parallel); // this works fine with the slot context and saves VRAM
+        batch = llama_batch_init(n_ctx, 0, params.n_parallel); // this works fine with the slot context and saves VRAM
     }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -567,9 +567,13 @@ struct llama_server_context
 
         for (llama_client_slot & slot : slots)
         {
+            if (slot.state == IDLE && slot.command != LOAD_PROMPT) {
+                LOG_TEE("Hijacking the first available slot %d\n", slot.id);
+                return &slot;
+            }
             if (slot.id == id && slot.available())
             {
-                LOG_TEE("Using available slot called by id: %d", slot.id);
+                LOG_TEE("Using if-based available slot called by id: %d", slot.id);
                 return &slot;
             }
 
@@ -577,7 +581,7 @@ struct llama_server_context
             {
                 last_used = &slot;
                 t_last = slot.t_last_used;
-                LOG_TEE("reusing earliest released slot id: %d\n", slot.id);
+                LOG_TEE("Using time-based slot id: %d\n", slot.id);
                 break;
             }
         }
@@ -1441,7 +1445,8 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
+                printf("Task data %d.\n", task.id);
+                llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); // returns nullptr if no slot available
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
@@ -2006,6 +2011,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
     printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
+    printf("  -skvg, --show-graphics    enable graphics displaying kvcache occupancy (default: false)\n");
     printf("\n");
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
@@ -2716,7 +2722,7 @@ int main(int argc, char **argv)
     svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
         json slots;
         for (llama_client_slot & slot : llama.slots) {
-            json slot_data = llama.get_formated_generation(slot);
+            json slot_data = llama.get_formatted_generation(slot);
             slot_data["id"] = slot.id;
             slot_data["task_id"] = slot.task_id;
             slot_data["state"] = slot.state;
@@ -2902,6 +2908,7 @@ int main(int argc, char **argv)
         json data = json::parse(req.body);
         const int task_id = llama.queue_tasks.get_new_id();
         llama.queue_results.add_waiting_task_id(task_id);
+        LOG_TEE("Initiated new task %d.\n", task_id);
         llama.request_completion(task_id, data, false, false, -1);
         if (!json_value(data, "stream", false)) {
             std::string completion_text;
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 0ee670dba..f016221a6 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -53,7 +53,7 @@ enum task_type {
 };
 
 struct task_server {
-    int id = -1; // to be filled by llama_server_queue
+    int id = -1; // for any instance, task id is not assigned yet; to be filled by llama_server_queue
     int target_id;
     task_type type;
     json data;
@@ -162,6 +162,9 @@
 
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value) {
     // Fallback null to default value
+    if (body.contains(key) && !body.at(key).is_null()) {
+        LOG_TEE("Body at %s in %d\n", key.c_str(), int(body.at(key)));
+    }
     return body.contains(key) && !body.at(key).is_null() ?
        body.value(key, default_value) : default_value;
@@ -238,6 +241,7 @@ struct llama_server_queue {
             task.id = id++;
         }
         queue_tasks.push_back(std::move(task));
+        //LOG_TEE("Queue now has %2zu members.\n", queue_tasks.size());
         condition_tasks.notify_one();
         return task.id;
     }
@@ -246,11 +250,13 @@ struct llama_server_queue {
     // Add a new task, but defer until one slot is available
     void defer(task_server task) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         queue_tasks_deferred.push_back(std::move(task));
+        LOG_TEE("Deferred task queue now has %3zu members.\n", queue_tasks_deferred.size());
     }
 
-    // Get the next id for creating anew task
+    // Get the next id for creating a new task
     int get_new_id() {
         std::unique_lock<std::mutex> lock(mutex_tasks);
+        LOG_TEE("New task id returned with value %d.\n", id);
         return id++;
     }
@@ -293,7 +299,7 @@ struct llama_server_queue {
         running = true;
         while (true) {
             // new task arrived
-            LOG_VERBOSE("have new task", {});
+            LOG_VERBOSE("have new task number %d.\n", {});
             {
                 while (true)
                 {
@@ -305,10 +311,8 @@ struct llama_server_queue {
                     task_server task = queue_tasks.front();
                     queue_tasks.erase(queue_tasks.begin());
                     lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {});
                     callback_new_task(task);
                 }
-                LOG_VERBOSE("callback_all_task_finished", {});
                 // process and update all the multitasks
                 auto queue_iterator = queue_multitasks.begin();
                 while (queue_iterator != queue_multitasks.end())
@@ -326,7 +330,7 @@ struct llama_server_queue {
                     ++queue_iterator;
                 }
             }
-            // all tasks in the current loop is finished
+            // all tasks in the current loop are finished
            callback_all_task_finished();
         }
         LOG_VERBOSE("wait for new task", {});
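
Usage note: the helper added in examples/cmap-example/TCPshellscript.cpp shells out to `lsof -i :<port>` via popen() and returns whatever lsof prints. Below is a minimal sketch (not part of the patch) of how the same idea could be turned into a yes/no check before binding the server port; the name is_port_in_use() is illustrative only, and it assumes a POSIX system where popen()/pclose() and the lsof utility are available.

// Sketch only: reuses the popen()/lsof approach from TCPshellscript.cpp as a
// boolean port check. is_port_in_use() is a hypothetical helper, not part of the patch.
#include <cstdio>
#include <iostream>
#include <string>

static bool is_port_in_use(int port) {
    // Any output from `lsof -i :<port>` means some process owns the port.
    std::string command = "lsof -i :" + std::to_string(port);
    FILE *pipe = popen(command.c_str(), "r");
    if (!pipe) {
        return false; // could not run the check; treat the port as free
    }
    char buffer[128];
    bool in_use = false;
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
        in_use = true;
    }
    pclose(pipe);
    return in_use;
}

int main() {
    const int port = 8080; // same default port the server example listens on
    std::cout << "Port " << port << (is_port_in_use(port) ? " is in use" : " appears free") << std::endl;
    return 0;
}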