Various server updates

2024-02-25 09:29:31 +00:00 · 2024-02-25 09:29:31 +00:00 · c80d429c42
commit c80d429c42
parent a9dd5f3769
7 changed files with 79 additions and 19 deletions
--- a/Llamaserver.py
+++ b/Llamaserver.py
@ -52,8 +52,12 @@ def send_request(q, question, event, count, num_requests):
    delay = 0.1

    global bar
-
-    data = {'prompt': question}
+    
+    system = "You are a helpful assistant who answers all requests \
+courteously and accurately without undue repetion. \
+you pay close attention to the nuance of a question and response accordingly."
+    
+    data = {'system': system, 'prompt': question}
    
    try:
        response = requests.post(url, headers=headers, json=data)
@ -114,7 +118,7 @@ if __name__ == "__main__":
    
    for i in range(num_requests):
        country = country_list[i % len(country_list)]
-        question = f"What was the total population of {country} in 2018?"
+        question = f"Tell me the political history of {country} up to 2018."
        # NOTE: don't pass the parameter as a function call; pass in args
        print(f"Processing request {i} / {num_requests}: {question}\n")
        event = threading.Event()
--- a/cmakescript.sh
+++ b/cmakescript.sh
@ -0,0 +1,13 @@
+# simple automation of cmake
+# usage is ./cmakescript.sh Debug || Release
+
+rm -r build
+cmake -B build
+cd build
+
+if [ $# -eq 1 ] && [[ "$1" == "Debug" || "$1" == "Release" ]]; then
+  cmake --build . --config "$1"
+else
+  echo "Usage: $0 (Debug|Release)"
+  exit 1
+fi
--- a/examples/cmap-example/CMakeLists.txt
+++ b/examples/cmap-example/CMakeLists.txt
@ -1,8 +1,8 @@
-set(TARGET TCPshellscript)
+set(TARGET TCPportquiz)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} TCPshellscript.cpp)
+add_executable(${TARGET} TCPportquiz.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/cmap-example/TCPshellscript.cpp
+++ b/examples/cmap-example/TCPshellscript.cpp
--- a/examples/cmap-example/cursor.cpp
+++ b/examples/cmap-example/cursor.cpp
@ -0,0 +1,26 @@
+// just trying to get the cursor position
+
+#include <cstdlib>
+
+struct CursorPos {
+  int x;
+  int y;
+};
+
+static CursorPos getCursorPos() {
+
+  // Get text cursor position
+  auto cursorPos = getCursorPos();
+
+  // Assign to struct
+  CursorPos pos;
+  pos.x = cursorPos.x;
+  pos.y = cursorPos.y;
+
+  return pos;
+}
+
+int main() {
+    CursorPos cursor = getCursorPos();
+    printf("The x co-ordinate of the cursor is %zu\n; the y co-ordinate of the cursor is %zu\n", cursor.x, cursor.y);
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -305,7 +305,7 @@ struct llama_client_slot

    void print_timings(llama_client_slot &slot, bool flag = false) const {
        if (flag) {
-            printf("\033[21;0H");        // needs to be sensitive to the number of slots
+            printf("\033[5;0H");        // needs to be sensitive to the number of slots
        };
        LOG_TEE("Finished processing slot %d.\n", slot.id);
        LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
@ -315,7 +315,7 @@ struct llama_client_slot
        LOG_TEE("%s:       total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);

        if (flag) {
-            printf("\033[25;0HPress any key ... ");
+            printf("\033[KPress any key ... \n");
            getchar();
        }
    }
@ -323,11 +323,11 @@ struct llama_client_slot

 // experimental/diagostic graphic to show kvcache status
 // requires just `slots` and `params.n_ctx` as parameters
-static void kvgraphics(std::vector<llama_client_slot>& slots, int cache_size) {
+static void kvgraphics(std::vector<llama_client_slot>& slots) {

    int max_length = 128;
    int num_blocks = slots.size();
-    size_t slot_cache_size = cache_size / num_blocks;
+    size_t slot_cache_size = slots[0].n_ctx;
    bool cls_flag = true;   // this flag only prevents repeated cls inside one call
    std::string slot_symbol1 = "";
    std::string slot_symbol2 = "";
@ -344,13 +344,16 @@ static void kvgraphics(std::vector<llama_client_slot>& slots, int cache_size) {

    // Print visualization
    // Always start at the top left of the window (H means 'move cursor to this position'; 2J = cls)
+    // See eblow for a rethink because controlling log printing is such a pain in C++11
    // Only clear the screen the first time round
    if (cls_flag) {
-        printf("\033[2J");
+        // printf("\033[2J");
        cls_flag = false;
    }
    printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");

+    // we can know and control how many lines of output we are printing so just start below that and fix the graphics location
+    printf("\033[%d;0H", 10);
    for(int i=0; i<num_blocks; i++) {
        printf("\033[K");  // clear the current line
        for(int j=0; j < max_length; j++) {
@ -384,7 +387,8 @@ static void kvgraphics(std::vector<llama_client_slot>& slots, int cache_size) {
        }
    printf(" %4zu/%5zu %2d %s %s %s\n", slots[i].cache_tokens.size(), slot_cache_size, slots[i].id, slot_symbol1.c_str(), slot_symbol2.c_str(), slot_symbol3.c_str());
    }
-    printf("\n\033[%dJ", num_blocks+5);     // move cursor to end of cache display
+    printf("\033[5;0H");   // just start two lines below the heading
+    //printf("\n\033[%d;0H\033[%dJ", 10, num_blocks+5);     // move cursor to end of cache display and clear thereafter
 }

 struct llama_server_context
@ -494,7 +498,7 @@ struct llama_server_context
            slot.n_ctx = n_ctx_slot;
            slot.n_predict = params.n_predict;

-            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+            LOG_TEE(" -> Slot %2i - max context: %i\n", slot.id, n_ctx_slot);

            const int ga_n = params.grp_attn_n;
            const int ga_w = params.grp_attn_w;
@ -504,7 +508,7 @@ struct llama_server_context
                GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");             // NOLINT
                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-                LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+                LOG_TEE(" -> Slot %2i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
            }

            slot.ga_i = 0;
@ -581,6 +585,7 @@ struct llama_server_context

        for (llama_client_slot & slot : slots)
        {
+            printf("\033[5;0H");
            if (slot.id == -1 && slot.available())
            {
                LOG_TEE("Unallocated task now using slot %d", slot.id);
@ -1438,6 +1443,7 @@ struct llama_server_context
                // why should task.data already contain a slot_id key when we haven't allocated it?
                // because if it doesnt the returned value will be -1; what makes it anything else?
                int requested_slot = json_value(task.data, "slot_id", -1);
+                printf("\033[5;0H\033[K");
                LOG_TEE("Task %d requesting slot %d\n", task.id, requested_slot);

                // why are we suddenly using 'slot' as a pointer here - confusing?
@ -1449,6 +1455,7 @@ struct llama_server_context
                    queue_tasks.defer(task);
                    break;
                } else {
+                    printf("\033[5;0H\033[K");
                    LOG_TEE("Activating slot %d.\n", (*slot).id);
                }

@ -1554,6 +1561,7 @@ struct llama_server_context
                    const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
                    const int n_discard = n_left / 2;       // is this arbitrary?

+                    printf("\033[5;0H\033[K");
                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
                    llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
                    llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
@ -1588,6 +1596,7 @@ struct llama_server_context
                slot.command = NONE;
                slot.t_last_used = ggml_time_us();

+                printf("\033[6;0H\033[K");
                LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
                queue_tasks.notify_slot_changed();  // why don't we immediately reallocate the released slot without waiting? Is this what -cb does?

@ -1736,6 +1745,7 @@ struct llama_server_context
                            slot.ga_i = ga_i;
                        }

+                        printf("\033[7;0H\033[K");
                        LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                    }

@ -1752,6 +1762,7 @@ struct llama_server_context
                        }
                    }

+                    printf("\033[5;0H\033[K");
                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);

                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
@ -1936,7 +1947,7 @@ struct llama_server_context

        // we are still inside llama_server_context so we can use an unqualified parameter
        if (skvgraphics) {
-            kvgraphics(slots, params.n_ctx);
+            kvgraphics(slots);
            }

        return true;
@ -2916,7 +2927,8 @@ int main(int argc, char **argv)
                    return;
                }
                // it appears that here we first get ONE request to parse; then TEN; then ONE-by-ONE
-                printf("Request body to parse: %s", req.body.c_str());
+                printf("\033[5;0H\033[K");
+                LOG_TEE("Request body to parse: %s", req.body.c_str());
                if (llama.skvinteract) {
                    getchar();
                }
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -36,6 +36,7 @@ extern bool server_verbose;
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)

+
 //
 // parallel
 //
@ -244,7 +245,9 @@ struct llama_server_queue {
    void defer(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
-        LOG_TEE("Deferred task queue now has %3zu members.\n", queue_tasks_deferred.size());
+        printf("\033[1;50H*** ");
+        LOG_TEE("Deferred queue now has %3zu members.\n", queue_tasks_deferred.size());
+        printf("\033[5;0H");
    }

    // Get the next id for creating a new task
@ -390,15 +393,17 @@ struct llama_server_response {
    void add_waiting_task_id(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
-        printf("\033[21;0H");
+        printf("\033[1;50H*** ");
        LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size());
+        printf("\033[5;0H");
    }

    void remove_waiting_task_id(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
-        printf("\033[21;0H");
+        printf("\033[2;50H*** ");
        LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
+        printf("\033[5;0H");
    }

    // This function blocks the thread until there is a response for this task_id