Major changes to server plus kvcache graphic

This commit is contained in:
pudepiedj 2024-02-16 11:16:13 +00:00
parent 326b418b59
commit 5249985578
9 changed files with 1033 additions and 532 deletions

110
Llamaserver.py Normal file
View file

@ -0,0 +1,110 @@
from queue import Queue
import threading
import requests
import json
def print_dict(data):
    """Recursively walk a (possibly nested) response dict and print every
    value stored under a "content" key.

    Args:
        data: a dict, possibly containing nested dicts and lists of dicts
              (the shape of a llama.cpp /completion JSON response).
    """
    for k, v in data.items():
        if isinstance(v, dict):
            print_dict(v)
        elif isinstance(v, list):
            for entry in v:
                # BUG FIX: recurse only into dict entries; a list of scalars
                # (e.g. strings or numbers) previously raised AttributeError
                # on entry.items().
                if isinstance(entry, dict):
                    print_dict(entry)
        elif k == "content":
            print(f"Key: {k:>30}: {v}")
    return
def print_response(text):
    """Echo the raw response text to stdout (thin convenience wrapper)."""
    print(text)
def make_empty_bar(num_requests):
    """Build the initial progress bar: one solid block (\u2589) per request.

    Args:
        num_requests: number of slots the bar should contain.
    Returns:
        The bar string (also echoed to stdout).
    """
    # Equivalent to the original join-then-strip-spaces construction.
    bar = "\u2589" * num_requests
    print(f"Bar is now {bar}.")
    return bar
def make_progress_bar(bar, count, num_requests):
    """Mark position `count` in the bar as completed by swapping the solid
    block for a stopwatch glyph (\u23F1).

    Args:
        bar:          current bar string.
        count:        zero-based index of the request that just finished.
        num_requests: total number of requests (bar width).
    Returns:
        The updated bar string (also echoed to stdout).
    """
    filled_width = len("\u2589")
    timer_width = len("\u23F1")
    # Direct index replaces the original linear scan for i == count.
    if 0 <= count < num_requests:
        print(f"Bar position {count} is {bar[count]}")
        start = count * filled_width
        bar = bar[:start] + "\u23F1" + bar[start + timer_width:]
    print(f"Bar is now {bar}")
    return bar
def send_request(q, question, event, count, num_requests):
    """Worker-thread target: POST `question` to the completion endpoint and
    queue the raw response text.

    Args:
        q:            Queue collecting raw response bodies for the main thread.
        question:     prompt string to send.
        event:        threading.Event set on HTTP 429 to signal back-off.
        count:        zero-based index of this request (for progress display).
        num_requests: total number of requests.

    Relies on module globals `url`, `headers` and `bar` bound in __main__.
    NOTE(review): `bar` is read-modified-written from many threads without a
    lock; last writer wins, which is tolerable for a progress display.
    """
    global bar
    data = {'prompt': question}
    try:
        response = requests.post(url, headers=headers, json=data)
        # BUG FIX: accept the whole 2xx success range. The original
        # `in [200, 300]` rejected 201/202 and treated the 300 redirect
        # status as success.
        if 200 <= response.status_code < 300:
            print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
            print(f"Status Code for {question}: {response.status_code}")
            print(f"Response to {question}:\n")
            print_dict(json.loads(response.text))
            # put the response text in the queue
            q.put(response.text)
            if not q.empty():
                print(f"Completed task {count} / {num_requests}\n")
                bar = make_progress_bar(bar, count, num_requests)
                # NOTE(review): this task_done() is not paired with a get();
                # harmless because nothing calls q.join(), but misleading.
                q.task_done()
        elif response.status_code == 429 and not q.empty():
            event.set()
            print("Server returned too many requests; back off!! Reset event.")
    except Exception as e:
        print(f"Server returned exception error {e}")
if __name__ == "__main__":
    # BUG FIX: removed the module-level `global bar` statement — `global`
    # is a no-op at module scope; names bound here are already module
    # globals visible to the worker threads.
    url = "http://localhost:8080/completion"
    num_requests = 40
    # maxsize matches num_requests so q.put() never blocks the workers.
    q = Queue(maxsize = 40)
    threads = []
    bar = make_empty_bar(num_requests)
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'User-Agent': 'Llamaserver.py'
    }
    country_list = ["France", "Germany", "China", "USA", "Italy", "India",
        "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
        "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
        "Bulgaria", "Romania", "Finland", "Sweden", "Norway", "Denmark", "Tanzania", "Israel",
        "Latvia", "Lithuania", "Estonia", "Pakistan", "Sri Lanka", "Malawi", "Mozambique"]
    # Fire one thread per request; the server queues/slots them internally.
    for i in range(num_requests):
        country = country_list[i % len(country_list)]
        question = f"When was the first democratic election (if any) in {country}?"
        # NOTE: don't pass the parameter as a function call; pass in args
        print(f"Processing request {i} / {num_requests}: {question}\n")
        event = threading.Event()
        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
        t.start()
        threads.append(t)
        # input("Any key",)
    for thread in threads:
        thread.join() # wait for all threads to finish
    print("FINISHED AND GETTING RESULTS")
    # Drain the queue and pretty-print each JSON response body.
    while not q.empty():
        text = q.get()
        print_dict(json.loads(text))

View file

@ -17,6 +17,7 @@ else()
add_subdirectory(batched-bench) add_subdirectory(batched-bench)
add_subdirectory(beam-search) add_subdirectory(beam-search)
add_subdirectory(benchmark) add_subdirectory(benchmark)
add_subdirectory(cmap-example)
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding) add_subdirectory(embedding)
add_subdirectory(finetune) add_subdirectory(finetune)

View file

@ -0,0 +1,9 @@
# Build the kvcacheviz example: a terminal visualizer for kv-cache occupancy.
set(TARGET kvcacheviz)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} kvcacheviz.cpp)
install(TARGETS ${TARGET} RUNTIME)
# NOTE(review): kvcacheviz.cpp appears to use only the standard library —
# the common/llava link dependencies look unnecessary; confirm before removing.
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
    # Winsock is required by the shared common code on Windows builds.
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,29 @@
# A simple illustration of how to represent cache occupancy
# graphically using unicode blocks
# which are generated using print("\u2588"), print("\u2591")
from time import sleep
import random
CACHE_SIZE = 50
used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10]
def visualize_kv_cache(used_blocks, total_size):
    """Render per-slot cache occupancy as a single overwriting terminal line.

    Args:
        used_blocks: list of per-slot used-cell counts.
        total_size:  capacity of each slot.
    Returns:
        The rendered line (also printed in place via carriage returns) —
        returned so callers/tests can inspect it; previously returned None.
    """
    cache_viz = "["
    tot_used = 0
    for i in range(len(used_blocks)):
        # Filled portion of this slot (dark block).
        cache_viz += "\u2589" * used_blocks[i]
        # BUG FIX: the free portion multiplied an EMPTY string, so unused
        # capacity was invisible; use the light-shade block the header
        # comment intended (\u2591).
        cache_viz += "\u2591" * (total_size - used_blocks[i])
        cache_viz += f"{used_blocks[i]:3.0f}/{total_size}]\r["
        tot_used += used_blocks[i]
    line = f"\r{cache_viz}] {tot_used}/{len(used_blocks) * total_size}"
    print(line, end="")
    return line
# Demo driver: redraw the bar every 0.5 s, sliding the window by dropping
# the oldest slot and appending a random new occupancy in [0, 50].
# NOTE(review): this loop runs forever at import time; consider guarding it
# under `if __name__ == "__main__":` if this file is ever imported.
while True:
    visualize_kv_cache(used_blocks, CACHE_SIZE)
    sleep(0.5)
    used_blocks = used_blocks[1:] + [random.randint(0,50)] # update used blocks

View file

@ -0,0 +1,68 @@
/*
A utility to represent the kv-cache occupancy graphically
Takes as parameters
- total cache size (-c)
- number of simultaneous accesses/slots (-np)
- a parameter related to the display context (max window width - data display requirements)
It then uses a trick borrowed from tqdm to display occupancy
TODO: Show contiguous space and block availability
*/
#include <iostream>
#include <iomanip>
#include <vector>
#include <cstdlib> // for rand()
/*
 * Continuously redraw a per-slot kv-cache occupancy display using ANSI
 * escape codes (blue = used, red = free), then simulate activity by
 * sliding the slot window. Blocks on std::cin.get() between frames.
 *
 * used_blocks - per-slot occupancy counts (taken by value: mutated locally)
 * cache_size  - total number of cache cells shared by all slots
 * max_length  - character width of each slot's bar
 */
static void show_kvcache(
    std::vector<int> used_blocks,
    int cache_size,
    int max_length
) {
    int num_blocks = used_blocks.size();
    // BUG FIX: guard the divisions below — an empty vector or a cache
    // smaller than the slot count previously divided/moduloed by zero.
    if (num_blocks == 0 || cache_size < num_blocks || max_length <= 0) {
        return;
    }
    int slot_cache_size = cache_size / num_blocks;
    while(true) {
        // Print visualization after erasing the current line
        for(int i=0; i<num_blocks; i++) {
            for(int j=0; j<max_length; j++) {
                // Scale this slot's occupancy to the bar width.
                if(j<used_blocks[i] * max_length / slot_cache_size) {
                    std::cout << "\033[94m█\033[0m";
                }
                //else if ((j == int(used_blocks[i] * max_length / slot_cache_size + 0.5)) && (j > 7 * max_length / slot_cache_size + 0.5)) {
                //    std::cout << "\033[D\033[D\033[D\033[D" << std::setw(3) << used_blocks[i] << "\033[C";
                //}
                else {
                    std::cout << "\033[91m█\033[0m";
                }
            }
            std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl;
        }
        std::cout << "{";
        // Build the escape sequence that moves the cursor back up over
        // everything we just printed (one \033[A\033[K per slot row).
        std::string upcursor = "\033[K\033[A\033[K";
        for(int i=0; i < num_blocks; i++){
            //std::cout << used_blocks[i] << " ";
            upcursor += "\033[A\033[K";
        }
        // Remove first element
        used_blocks.erase(used_blocks.begin());
        // Add new random block at the end
        // BUG FIX: u_int is a non-standard BSD/glibc alias — use the
        // portable spelling.
        unsigned int new_block = rand() % slot_cache_size;
        used_blocks.push_back(new_block);
        // Adjust the cursor so that the display overwrites itself
        upcursor += "\033[A\033[K";
        std::cout << "}" << std::endl;
        // Wait for a keypress before redrawing the next frame.
        std::cin.get();
        std::cout << upcursor;
    }
}
// Entry point: seed the display with eighteen pre-filled slots and hand
// control to the (never-returning) visualizer loop.
int main() {
    const std::vector<int> initial_occupancy = {64, 64, 64, 64, 64, 64, 64, 64, 64, 46, 46, 46, 46, 46, 46, 46, 46, 46};
    const int total_cache_cells = 65536;
    const int bar_width = 128;
    show_kvcache(initial_occupancy, total_cache_cells, bar_width);
}

View file

@ -0,0 +1,17 @@
# Adjust the main.cpp file
# to include the (Tokens used:) data output
try:
    # Open read-write so we can rewrite the file in place.
    with open("/Users/edsilm2/llama.cpp/examples/main/main.cpp", 'r+') as file:
        main = file.read()
        # Replace the bare interactive prompt with a token-usage banner.
        search_str = 'printf("\\n> ");'
        new_str = 'printf("\\033[31m(Tokens used: %d / %d)\\033[0m\\nJCP: ", n_past, n_ctx);'
        main = main.replace(search_str, new_str)
        # Drop the hard-coded minus sign so the logged n_predict is correct.
        search_str = 'context full and n_predict == -%d => stopping'
        new_str = 'context full and n_predict == %d => stopping'
        main = main.replace(search_str, new_str)
        file.seek(0)
        file.write(main)
        # BUG FIX: truncate after the rewrite — the second replacement makes
        # the content shorter, and without truncate() the stale trailing
        # bytes of the old file would remain and corrupt the source.
        file.truncate()
except FileNotFoundError as fe:
    print(f"Error searching for main.cpp: {fe}")

View file

@ -522,7 +522,7 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
break; break;
} }
@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
LOG("waiting for user input\n"); LOG("waiting for user input\n");
if (params.instruct || params.chatml) { if (params.instruct || params.chatml) {
printf("\n> "); printf("\033[31m(Tokens used: %d / %d)\033[0m\nJCP: ", n_past, n_ctx);
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {

View file

@ -19,7 +19,7 @@
#endif #endif
#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT #ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 #define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 // originally 5
#endif #endif
#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND #ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
@ -110,7 +110,7 @@
#endif #endif
#ifndef CPPHTTPLIB_LISTEN_BACKLOG #ifndef CPPHTTPLIB_LISTEN_BACKLOG
#define CPPHTTPLIB_LISTEN_BACKLOG 5 #define CPPHTTPLIB_LISTEN_BACKLOG 15 // originally 5
#endif #endif
/* /*

File diff suppressed because it is too large Load diff