Major changes to server plus kvcache graphic

This commit is contained in:
pudepiedj 2024-02-16 11:16:13 +00:00
parent 326b418b59
commit 5249985578
9 changed files with 1033 additions and 532 deletions

110
Llamaserver.py Normal file
View file

@ -0,0 +1,110 @@
from queue import Queue
import threading
import requests
import json
def print_dict(data):
    """Recursively walk a (possibly nested) response dict and print every
    value stored under a "content" key.

    Args:
        data: a dict, possibly containing nested dicts and lists of dicts
              (the shape of a llama.cpp /completion JSON response).
    """
    for k, v in data.items():
        if isinstance(v, dict):
            print_dict(v)
        elif isinstance(v, list):
            for entry in v:
                # BUG FIX: recurse only into dict entries; a list of scalars
                # (e.g. strings or numbers) previously raised AttributeError
                # on entry.items().
                if isinstance(entry, dict):
                    print_dict(entry)
        elif k == "content":
            print(f"Key: {k:>30}: {v}")
    return
def print_response(text):
    """Echo the raw response text to stdout (thin convenience wrapper)."""
    print(text)
def make_empty_bar(num_requests):
    """Build the initial progress bar: one solid block (\u2589) per request.

    Args:
        num_requests: number of slots the bar should contain.
    Returns:
        The bar string (also echoed to stdout).
    """
    # Equivalent to the original join-then-strip-spaces construction.
    bar = "\u2589" * num_requests
    print(f"Bar is now {bar}.")
    return bar
def make_progress_bar(bar, count, num_requests):
    """Mark position `count` in the bar as completed by swapping the solid
    block for a stopwatch glyph (\u23F1).

    Args:
        bar:          current bar string.
        count:        zero-based index of the request that just finished.
        num_requests: total number of requests (bar width).
    Returns:
        The updated bar string (also echoed to stdout).
    """
    filled_width = len("\u2589")
    timer_width = len("\u23F1")
    # Direct index replaces the original linear scan for i == count.
    if 0 <= count < num_requests:
        print(f"Bar position {count} is {bar[count]}")
        start = count * filled_width
        bar = bar[:start] + "\u23F1" + bar[start + timer_width:]
    print(f"Bar is now {bar}")
    return bar
def send_request(q, question, event, count, num_requests):
    """Worker-thread target: POST `question` to the completion endpoint and
    queue the raw response text.

    Args:
        q:            Queue collecting raw response bodies for the main thread.
        question:     prompt string to send.
        event:        threading.Event set on HTTP 429 to signal back-off.
        count:        zero-based index of this request (for progress display).
        num_requests: total number of requests.

    Relies on module globals `url`, `headers` and `bar` bound in __main__.
    NOTE(review): `bar` is read-modified-written from many threads without a
    lock; last writer wins, which is tolerable for a progress display.
    """
    global bar
    data = {'prompt': question}
    try:
        response = requests.post(url, headers=headers, json=data)
        # BUG FIX: accept the whole 2xx success range. The original
        # `in [200, 300]` rejected 201/202 and treated the 300 redirect
        # status as success.
        if 200 <= response.status_code < 300:
            print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
            print(f"Status Code for {question}: {response.status_code}")
            print(f"Response to {question}:\n")
            print_dict(json.loads(response.text))
            # put the response text in the queue
            q.put(response.text)
            if not q.empty():
                print(f"Completed task {count} / {num_requests}\n")
                bar = make_progress_bar(bar, count, num_requests)
                # NOTE(review): this task_done() is not paired with a get();
                # harmless because nothing calls q.join(), but misleading.
                q.task_done()
        elif response.status_code == 429 and not q.empty():
            event.set()
            print("Server returned too many requests; back off!! Reset event.")
    except Exception as e:
        print(f"Server returned exception error {e}")
if __name__ == "__main__":
    # BUG FIX: removed the module-level `global bar` statement — `global`
    # is a no-op at module scope; names bound here are already module
    # globals visible to the worker threads.
    url = "http://localhost:8080/completion"
    num_requests = 40
    # maxsize matches num_requests so q.put() never blocks the workers.
    q = Queue(maxsize = 40)
    threads = []
    bar = make_empty_bar(num_requests)
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'User-Agent': 'Llamaserver.py'
    }
    country_list = ["France", "Germany", "China", "USA", "Italy", "India",
        "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
        "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
        "Bulgaria", "Romania", "Finland", "Sweden", "Norway", "Denmark", "Tanzania", "Israel",
        "Latvia", "Lithuania", "Estonia", "Pakistan", "Sri Lanka", "Malawi", "Mozambique"]
    # Fire one thread per request; the server queues/slots them internally.
    for i in range(num_requests):
        country = country_list[i % len(country_list)]
        question = f"When was the first democratic election (if any) in {country}?"
        # NOTE: don't pass the parameter as a function call; pass in args
        print(f"Processing request {i} / {num_requests}: {question}\n")
        event = threading.Event()
        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
        t.start()
        threads.append(t)
        # input("Any key",)
    for thread in threads:
        thread.join() # wait for all threads to finish
    print("FINISHED AND GETTING RESULTS")
    # Drain the queue and pretty-print each JSON response body.
    while not q.empty():
        text = q.get()
        print_dict(json.loads(text))

View file

@ -17,6 +17,7 @@ else()
add_subdirectory(batched-bench) add_subdirectory(batched-bench)
add_subdirectory(beam-search) add_subdirectory(beam-search)
add_subdirectory(benchmark) add_subdirectory(benchmark)
add_subdirectory(cmap-example)
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(finetune) add_subdirectory(finetune)

View file

@ -0,0 +1,9 @@
# Build the kvcacheviz example: a terminal visualizer for kv-cache occupancy.
set(TARGET kvcacheviz)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} kvcacheviz.cpp)
install(TARGETS ${TARGET} RUNTIME)
# NOTE(review): kvcacheviz.cpp appears to use only the standard library —
# the common/llava link dependencies look unnecessary; confirm before removing.
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
    # Winsock is required by the shared common code on Windows builds.
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,29 @@
# A simple illustration of how to represent cache occupancy
# graphically using unicode blocks
# which are generated using print("\u2588"), print("\u2591")
from time import sleep
import random
CACHE_SIZE = 50
used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10]
def visualize_kv_cache(used_blocks, total_size):
    """Render per-slot cache occupancy as a single overwriting terminal line.

    Args:
        used_blocks: list of per-slot used-cell counts.
        total_size:  capacity of each slot.
    Returns:
        The rendered line (also printed in place via carriage returns) —
        returned so callers/tests can inspect it; previously returned None.
    """
    cache_viz = "["
    tot_used = 0
    for i in range(len(used_blocks)):
        # Filled portion of this slot (dark block).
        cache_viz += "\u2589" * used_blocks[i]
        # BUG FIX: the free portion multiplied an EMPTY string, so unused
        # capacity was invisible; use the light-shade block the header
        # comment intended (\u2591).
        cache_viz += "\u2591" * (total_size - used_blocks[i])
        cache_viz += f"{used_blocks[i]:3.0f}/{total_size}]\r["
        tot_used += used_blocks[i]
    line = f"\r{cache_viz}] {tot_used}/{len(used_blocks) * total_size}"
    print(line, end="")
    return line
# Demo driver: redraw the bar every 0.5 s, sliding the window by dropping
# the oldest slot and appending a random new occupancy in [0, 50].
# NOTE(review): this loop runs forever at import time; consider guarding it
# under `if __name__ == "__main__":` if this file is ever imported.
while True:
    visualize_kv_cache(used_blocks, CACHE_SIZE)
    sleep(0.5)
    used_blocks = used_blocks[1:] + [random.randint(0,50)] # update used blocks

View file

@ -0,0 +1,68 @@
/*
A utility to represent the kv-cache occupancy graphically
Takes as parameters
- total cache size (-c)
- number of simultaneous accesses/slots (-np)
- a parameter related to the display context (max window width - data display requirements)
It then uses a trick borrowed from tqdm to display occupancy
TODO: Show contiguous space and block availability
*/
#include <iostream>
#include <iomanip>
#include <vector>
#include <cstdlib> // for rand()
/*
 * Continuously redraw a per-slot kv-cache occupancy display using ANSI
 * escape codes (blue = used, red = free), then simulate activity by
 * sliding the slot window. Blocks on std::cin.get() between frames.
 *
 * used_blocks - per-slot occupancy counts (taken by value: mutated locally)
 * cache_size  - total number of cache cells shared by all slots
 * max_length  - character width of each slot's bar
 */
static void show_kvcache(
    std::vector<int> used_blocks,
    int cache_size,
    int max_length
) {
    int num_blocks = used_blocks.size();
    // BUG FIX: guard the divisions below — an empty vector or a cache
    // smaller than the slot count previously divided/moduloed by zero.
    if (num_blocks == 0 || cache_size < num_blocks || max_length <= 0) {
        return;
    }
    int slot_cache_size = cache_size / num_blocks;
    while(true) {
        // Print visualization after erasing the current line
        for(int i=0; i<num_blocks; i++) {
            for(int j=0; j<max_length; j++) {
                // Scale this slot's occupancy to the bar width.
                if(j<used_blocks[i] * max_length / slot_cache_size) {
                    std::cout << "\033[94m█\033[0m";
                }
                //else if ((j == int(used_blocks[i] * max_length / slot_cache_size + 0.5)) && (j > 7 * max_length / slot_cache_size + 0.5)) {
                //    std::cout << "\033[D\033[D\033[D\033[D" << std::setw(3) << used_blocks[i] << "\033[C";
                //}
                else {
                    std::cout << "\033[91m█\033[0m";
                }
            }
            std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl;
        }
        std::cout << "{";
        // Build the escape sequence that moves the cursor back up over
        // everything we just printed (one \033[A\033[K per slot row).
        std::string upcursor = "\033[K\033[A\033[K";
        for(int i=0; i < num_blocks; i++){
            //std::cout << used_blocks[i] << " ";
            upcursor += "\033[A\033[K";
        }
        // Remove first element
        used_blocks.erase(used_blocks.begin());
        // Add new random block at the end
        // BUG FIX: u_int is a non-standard BSD/glibc alias — use the
        // portable spelling.
        unsigned int new_block = rand() % slot_cache_size;
        used_blocks.push_back(new_block);
        // Adjust the cursor so that the display overwrites itself
        upcursor += "\033[A\033[K";
        std::cout << "}" << std::endl;
        // Wait for a keypress before redrawing the next frame.
        std::cin.get();
        std::cout << upcursor;
    }
}
// Entry point: seed the display with eighteen pre-filled slots and hand
// control to the (never-returning) visualizer loop.
int main() {
    const std::vector<int> initial_occupancy = {64, 64, 64, 64, 64, 64, 64, 64, 64, 46, 46, 46, 46, 46, 46, 46, 46, 46};
    const int total_cache_cells = 65536;
    const int bar_width = 128;
    show_kvcache(initial_occupancy, total_cache_cells, bar_width);
}

View file

@ -0,0 +1,17 @@
# Adjust the main.cpp file
# to include the (Tokens used:) data output
try:
    # Open read-write so we can rewrite the file in place.
    with open("/Users/edsilm2/llama.cpp/examples/main/main.cpp", 'r+') as file:
        main = file.read()
        # Replace the bare interactive prompt with a token-usage banner.
        search_str = 'printf("\\n> ");'
        new_str = 'printf("\\033[31m(Tokens used: %d / %d)\\033[0m\\nJCP: ", n_past, n_ctx);'
        main = main.replace(search_str, new_str)
        # Drop the hard-coded minus sign so the logged n_predict is correct.
        search_str = 'context full and n_predict == -%d => stopping'
        new_str = 'context full and n_predict == %d => stopping'
        main = main.replace(search_str, new_str)
        file.seek(0)
        file.write(main)
        # BUG FIX: truncate after the rewrite — the second replacement makes
        # the content shorter, and without truncate() the stale trailing
        # bytes of the old file would remain and corrupt the source.
        file.truncate()
except FileNotFoundError as fe:
    print(f"Error searching for main.cpp: {fe}")

View file

@ -522,7 +522,7 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
break; break;
} }
@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
LOG("waiting for user input\n"); LOG("waiting for user input\n");
if (params.instruct || params.chatml) { if (params.instruct || params.chatml) {
printf("\n> "); printf("\033[31m(Tokens used: %d / %d)\033[0m\nJCP: ", n_past, n_ctx);
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {

View file

@ -19,7 +19,7 @@
#endif #endif
#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 #define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 // originally 5
#endif #endif
#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
@ -110,7 +110,7 @@
#endif #endif
#ifndef CPPHTTPLIB_LISTEN_BACKLOG #ifndef CPPHTTPLIB_LISTEN_BACKLOG
#define CPPHTTPLIB_LISTEN_BACKLOG 5 #define CPPHTTPLIB_LISTEN_BACKLOG 15 // originally 5
#endif #endif
/* /*

File diff suppressed because it is too large Load diff