Major changes to server plus kvcache graphic

2024-02-16 11:16:13 +00:00 · 2024-02-16 11:16:13 +00:00 · 5249985578
commit 5249985578
parent 326b418b59
9 changed files with 1033 additions and 532 deletions
--- a/Llamaserver.py
+++ b/Llamaserver.py
@ -0,0 +1,110 @@
+from queue import Queue
+import threading
+import requests
+import json
+
+def print_dict(data):
+    """Recursively print every value stored under a "content" key.
+
+    Nested dicts and lists are walked to any depth. Values that are neither
+    a dict nor a list (strings, numbers, None) are skipped, so a response
+    containing a list of scalars no longer raises AttributeError.
+
+    Args:
+        data: A decoded JSON value (dict, list or scalar).
+    """
+    if isinstance(data, list):
+        for entry in data:
+            print_dict(entry)
+        return
+    if not isinstance(data, dict):
+        # Scalars have no keys to inspect; previously this case crashed on
+        # ``data.items()`` when a list held strings or numbers.
+        return
+    for k, v in data.items():
+        if isinstance(v, (dict, list)):
+            print_dict(v)
+        elif k == "content":
+            print(f"Key: {k:>30}: {v}")
+
+def print_response(text):
+    """Print *text* to standard output unchanged."""
+    print(text)
+
+def make_empty_bar(num_requests):
+    """Return the initial progress bar: one block character per request.
+
+    The freshly built bar is also echoed to standard output.
+
+    Args:
+        num_requests: Number of cells the bar should contain.
+
+    Returns:
+        A string of ``num_requests`` block characters with no separators.
+    """
+    empty_bar = "\u2589" * num_requests
+    print(f"Bar is now {empty_bar}.")
+    return empty_bar
+
+def make_progress_bar(bar, count, num_requests):
+    """Return *bar* with cell *count* replaced by the "done" marker.
+
+    When ``count`` is not one of ``0 .. num_requests - 1`` the bar is
+    returned unchanged. The resulting bar is always echoed to stdout.
+
+    Args:
+        bar: Current progress-bar string.
+        count: Index of the request that has just completed.
+        num_requests: Total number of cells the bar is expected to hold.
+
+    Returns:
+        The updated progress-bar string.
+    """
+    done_marker = "\u23F1"
+    block_width = len("\u2589")
+    marker_width = len(done_marker)
+    slots = range(num_requests)
+    if count in slots:
+        # index() yields the matching integer position within the range.
+        pos = slots.index(count)
+        print(f"Bar position {pos} is {bar[pos]}")
+        start = pos * block_width
+        bar = bar[:start] + done_marker + bar[start + marker_width:]
+    print(f"Bar is now {bar}")
+    return bar
+
+def send_request(q, question, event, count, num_requests):
+    """POST one completion prompt to the server and queue the raw reply.
+
+    Reads the module-level ``url`` and ``headers`` and rebinds the
+    module-level ``bar``; all three are assigned in the ``__main__`` section
+    before any worker thread is started.
+
+    Args:
+        q: Queue that receives ``response.text`` for an accepted reply.
+        question: Prompt text sent as the ``prompt`` field of the JSON body.
+        event: Event object that is set when the server answers 429.
+        count: Index of this request; selects the progress-bar cell to mark.
+        num_requests: Total number of requests, used in progress output.
+    """
+    global bar
+
+    data = {'prompt': question}
+
+    try:
+        response = requests.post(url, headers=headers, json=data)
+        # NOTE(review): this accepts exactly 200 and 300, not the 2xx range;
+        # confirm whether 300 is intentional.
+        if response.status_code in [200,300]:
+            print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
+            print(f"Status Code for {question}: {response.status_code}")
+            print(f"Response to {question}:\n")
+            print_dict(json.loads(response.text))
+            # put the response text in the queue
+            q.put(response.text)
+            if not q.empty():
+                print(f"Completed task {count} / {num_requests}\n")
+                # NOTE(review): ``bar`` is read and rebound here by every
+                # worker thread without a lock, so concurrent updates may
+                # overwrite each other.
+                bar = make_progress_bar(bar, count, num_requests)
+            # NOTE(review): task_done() is called by the producer right after
+            # put(); nothing in this file calls q.join().
+            q.task_done()
+        elif response.status_code == 429 and not q.empty():
+            event.set()
+            print("Server return too many requests; back off!! Reset event.")
+        else:
+            # Any other status (and a 429 seen while the queue is still
+            # empty) used to be dropped without a trace; report it instead.
+            print(f"Unexpected status code for {question}: {response.status_code}")
+    except Exception as e:
+        print(f"Server returned exception error {e}")
+# Script entry point: start one thread per request, each POSTing a question
+# to the completion endpoint, wait for all of them, then print the "content"
+# fields of every response collected in the queue.
+if __name__ == "__main__":
+
+    # NOTE: a ``global`` statement at module level has no effect; ``bar`` is
+    # already a module-level name here.
+    global bar
+    
+    url = "http://localhost:8080/completion"
+
+    num_requests = 40
+    # The queue capacity equals num_requests.
+    q = Queue(maxsize = 40)
+    threads = []
+
+    bar = make_empty_bar(num_requests)
+
+    headers = {
+        'Content-Type': 'application/json',
+        'Accept': 'application/json',  
+        'User-Agent': 'Llamaserver.py'
+        }
+
+    # The list is shorter than num_requests ("Israel" appears twice), so the
+    # modulo below wraps around and some countries are asked about again.
+    country_list = ["France", "Germany", "China", "USA", "Italy", "India",
+                    "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
+                    "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
+                    "Bulgaria", "Romania", "Finland", "Sweden", "Norway", "Denmark", "Tanzania", "Israel",
+                    "Latvia", "Lithuania", "Estonia", "Pakistan", "Sri Lanka", "Malawi", "Mozambique"]
+    
+    for i in range(num_requests):
+        country = country_list[i % len(country_list)]
+        question = f"When was the first democratic election (if any) in {country}?"
+        # NOTE: don't pass the parameter as a function call; pass in args
+        print(f"Processing request {i} / {num_requests}: {question}\n")
+        # A fresh Event is created for every thread; nothing in this file
+        # waits on it.
+        event = threading.Event()
+        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests)) 
+        t.start()
+        threads.append(t)
+        # input("Any key",)
+
+    for thread in threads:
+        thread.join()   # wait for all threads to finish
+
+    # All workers have finished, so the queue can be drained without blocking.
+    print("FINISHED AND GETTING RESULTS")
+    while not q.empty():
+        text = q.get()  
+        print_dict(json.loads(text))
+
+
+    
+
+
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -17,6 +17,7 @@ else()
    add_subdirectory(batched-bench)
    add_subdirectory(beam-search)
    add_subdirectory(benchmark)
+    add_subdirectory(cmap-example)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
    add_subdirectory(finetune)
--- a/examples/cmap-example/CMakeLists.txt
+++ b/examples/cmap-example/CMakeLists.txt
@ -0,0 +1,9 @@
+# Build and install the kvcacheviz example executable from kvcacheviz.cpp.
+set(TARGET kvcacheviz)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(${TARGET} kvcacheviz.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+# NOTE(review): kvcacheviz.cpp includes only standard C++ headers, so the
+# common/llava/thread (and ws2_32 on Windows) links may be unnecessary - confirm.
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/cmap-example/KVcacheViz.py
+++ b/examples/cmap-example/KVcacheViz.py
@ -0,0 +1,29 @@
+# A simple illustration of how to represent cache occupancy
+# graphically using unicode blocks
+# which are generated using print("\u2588"), print("\u2591")
+
+from time import sleep
+import random
+
+# Number of cells drawn for each row of the visualisation.
+CACHE_SIZE = 50 
+# Initial occupancy values; one row is rendered per entry.
+used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10]
+
+def visualize_kv_cache(used_blocks, total_size):
+    """Print one occupancy row per entry of *used_blocks*, then a total.
+
+    Each row consists of ``used`` filled cells, ``total_size - used`` shaded
+    cells and a ``used/total_size`` label. Rows are separated by a carriage
+    return, and the output is written without a trailing newline.
+
+    Args:
+        used_blocks: Sequence of per-row occupancy counts.
+        total_size: Number of cells in every row.
+    """
+    rows = [
+        "\u2589" * used + "░" * (total_size - used) + f"{used:3.0f}/{total_size}]\r["
+        for used in used_blocks
+    ]
+    rendered = "[" + "".join(rows)
+    tot_used = sum(used_blocks)
+    capacity = len(used_blocks) * total_size
+
+    print(f"\r{rendered}] {tot_used}/{capacity}", end="")
+    
+
+# Redraw the display twice a second, sliding the window of occupancy values:
+# the oldest entry is dropped and a new random one is appended.
+while True:
+    visualize_kv_cache(used_blocks, CACHE_SIZE)
+    sleep(0.5)
+    # Bound the random occupancy by CACHE_SIZE rather than a repeated literal
+    # so the two cannot drift apart.
+    used_blocks = used_blocks[1:] + [random.randint(0, CACHE_SIZE)] # update used blocks
--- a/examples/cmap-example/kvcacheviz.cpp
+++ b/examples/cmap-example/kvcacheviz.cpp
@ -0,0 +1,68 @@
+/*
+A utility to represent the kv-cache occupancy graphically
+Intended parameters (currently hard-coded in main(); nothing is read from the command line)
+- total cache size (-c)
+- number of simultaneous accesses/slots (-np)
+- a parameter related to the display context (max window width - data display requirements)
+It then uses a trick borrowed from tqdm to display occupancy
+TODO: Show contiguous space and block availability
+*/
+#include <cstdlib> // for rand()
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Draw one coloured bar per entry of used_blocks and redraw it in place each
+// time a character is read from stdin, replacing the oldest entry with a new
+// random one on every round.
+//
+//   used_blocks  occupancy value per slot (taken by value; modified locally)
+//   cache_size   total cache size; divided evenly between the slots
+//   max_length   number of cells drawn per bar
+//
+// Returns immediately when there are no slots or the per-slot size is not
+// positive, and stops once stdin can no longer be read.
+static void show_kvcache(
+  std::vector<int> used_blocks,
+  int cache_size,
+  int max_length
+) {
+  const int num_blocks = static_cast<int>(used_blocks.size());
+  if (num_blocks == 0) {
+    return; // nothing to draw; also avoids dividing by zero below
+  }
+  const int slot_cache_size = cache_size / num_blocks;
+  if (slot_cache_size <= 0) {
+    return; // would divide / take a modulus by zero below
+  }
+
+  while (true) {
+    // One row per slot: blue cells for the used share, red for the rest.
+    for (int i = 0; i < num_blocks; i++) {
+      const int filled = used_blocks[i] * max_length / slot_cache_size;
+      for (int j = 0; j < max_length; j++) {
+        if (j < filled) {
+          std::cout << "\033[94m█\033[0m";
+        } else {
+          std::cout << "\033[91m█\033[0m";
+        }
+      }
+      std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl;
+    }
+    std::cout << "{";
+
+    // Build the "cursor up + erase line" sequence that moves back over
+    // everything printed above so the next frame overwrites this one.
+    std::string upcursor = "\033[K\033[A\033[K";
+    for (int i = 0; i < num_blocks; i++) {
+      upcursor += "\033[A\033[K";
+    }
+
+    // Remove first element
+    used_blocks.erase(used_blocks.begin());
+
+    // Add new random block at the end
+    const int new_block = rand() % slot_cache_size;
+    used_blocks.push_back(new_block);
+
+    // Adjust the cursor so that the display overwrites itself
+    upcursor += "\033[A\033[K";
+    std::cout << "}" << std::endl;
+    std::cin.get();
+    if (!std::cin) {
+      break; // stdin closed: stop instead of redrawing in a busy loop
+    }
+    std::cout << upcursor;
+  }
+}
+
+// Demo entry point: 18 slots (nine at 64, nine at 46) sharing a cache of
+// 65536, drawn with bars 128 cells wide.
+int main() {
+  const int cache_size = 65536;
+  const int max_length = 128;
+
+  std::vector<int> used_blocks(9, 64);
+  used_blocks.insert(used_blocks.end(), 9, 46);
+
+  show_kvcache(used_blocks, cache_size, max_length);
+}
--- a/examples/main/Adjustmain.py
+++ b/examples/main/Adjustmain.py
@ -0,0 +1,17 @@
+# Adjust the main.cpp file
+# to include the (Tokens used:) data output
+
+# Patch main.cpp in place: apply both textual replacements to the file's
+# contents, then write the result back over the original.
+try:
+    with open("/Users/edsilm2/llama.cpp/examples/main/main.cpp", 'r+') as file:
+        main = file.read()
+        # Replace the plain "> " prompt with a coloured token-usage prompt.
+        search_str = 'printf("\\n> ");'
+        new_str = 'printf("\\033[31m(Tokens used: %d / %d)\\033[0m\\nJCP: ", n_past, n_ctx);'
+        main = main.replace(search_str, new_str)
+        # Drop the stray '-' in front of %d in the "context full" message.
+        search_str = 'context full and n_predict == -%d => stopping'
+        new_str = 'context full and n_predict == %d => stopping'
+        main = main.replace(search_str, new_str)
+        file.seek(0)
+        file.write(main)
+        # The rewritten text can be shorter than the original (the second
+        # replacement removes a character); cut off any stale trailing bytes.
+        file.truncate()
+except FileNotFoundError as fe:
+    print(f"Error searching for main.cpp: {fe}")
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -522,7 +522,7 @@ int main(int argc, char ** argv) {
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                    if (params.n_predict == -2) {
-                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                        LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
                        break;
                    }

@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
                LOG("waiting for user input\n");

                if (params.instruct || params.chatml) {
-                    printf("\n> ");
+                    printf("\033[31m(Tokens used: %d / %d)\033[0m\nJCP: ", n_past, n_ctx);
                }

                if (params.input_prefix_bos) {
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
@ -19,7 +19,7 @@
 #endif

 #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
-#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5
+#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5   // originally 5
 #endif

 #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
@ -110,7 +110,7 @@
 #endif

 #ifndef CPPHTTPLIB_LISTEN_BACKLOG
-#define CPPHTTPLIB_LISTEN_BACKLOG 5
+#define CPPHTTPLIB_LISTEN_BACKLOG 15    // originally 5
 #endif

 /*
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp