Major changes to server plus kvcache graphic
This commit is contained in:
parent
326b418b59
commit
5249985578
9 changed files with 1033 additions and 532 deletions
110
Llamaserver.py
Normal file
110
Llamaserver.py
Normal file
|
@ -0,0 +1,110 @@
|
|||
from queue import Queue
|
||||
import threading
|
||||
import requests
|
||||
import json
|
||||
|
||||
def print_dict(data):
    """Recursively print every ``"content"`` value found in decoded JSON.

    Walks nested dicts and lists. For each dict key named ``"content"``
    whose value is not itself a dict or list, prints the key right-aligned
    in a 30-character column followed by the value.

    Args:
        data: A decoded JSON value. Dicts and lists are traversed; any
            other value (str, int, None, ...) is ignored, so lists of
            scalars no longer raise ``AttributeError``.
    """
    if isinstance(data, list):
        for entry in data:
            print_dict(entry)
        return
    if not isinstance(data, dict):
        # Scalars carry no keys, so there is nothing to report.
        return
    for k, v in data.items():
        if isinstance(v, (dict, list)):
            print_dict(v)
        elif k == "content":
            print(f"Key: {k:>30}: {v}")
|
||||
|
||||
def print_response(text):
    """Write *text* to standard output, followed by a newline."""
    print(text)
|
||||
|
||||
def make_empty_bar(num_requests):
    """Build the initial progress bar: one block glyph per pending request.

    Args:
        num_requests: Number of requests the bar should represent. Zero or
            a negative value yields an empty bar.

    Returns:
        A string of ``num_requests`` copies of U+2589 (left seven-eighths
        block). The bar is also echoed to stdout.
    """
    # String repetition replaces the append / join-with-space / strip-space
    # round trip, which produced exactly this value.
    bar = "\u2589" * num_requests
    print(f"Bar is now {bar}.")
    return bar
|
||||
|
||||
def make_progress_bar(bar, count, num_requests):
    """Return *bar* with the glyph at position *count* marked as completed.

    Args:
        bar: Current progress bar, one character per request.
        count: Zero-based index of the request that just finished.
        num_requests: Total number of requests the bar represents.

    Returns:
        A new string in which position ``count`` holds U+23F1 (stopwatch).
        If ``count`` lies outside ``0 .. num_requests - 1`` or beyond the
        end of ``bar``, the bar is returned unchanged instead of raising
        ``IndexError``.
    """
    # Every glyph is a single code point, so the position is simply `count`;
    # no scan over all requests and no per-glyph stride is needed.
    if 0 <= count < min(num_requests, len(bar)):
        print(f"Bar position {count} is {bar[count]}")
        bar = bar[:count] + "\u23F1" + bar[count + 1:]
    print(f"Bar is now {bar}")
    return bar
|
||||
|
||||
# Serialises updates to the shared progress bar: every worker thread does a
# read-modify-write on the global `bar`, which would otherwise lose updates.
_bar_lock = threading.Lock()


def send_request(q, question, event, count, num_requests):
    """POST one prompt to the completion endpoint and record the reply.

    Intended to run as a thread target. Reads the module-level ``url`` and
    ``headers`` and updates the module-level progress ``bar``.

    Args:
        q: Queue that receives the raw response text on success.
        question: Prompt text sent as the ``prompt`` field of the JSON body.
        event: ``threading.Event`` that is set when the server answers 429
            while the queue is non-empty.
        count: Zero-based index of this request, used for progress output.
        num_requests: Total number of requests being issued.

    Any exception raised while sending or handling the request is caught
    and reported on stdout so that one failing worker does not produce an
    unhandled traceback in its thread.
    """
    global bar

    data = {'prompt': question}

    try:
        response = requests.post(url, headers=headers, json=data)
        if response.status_code in (200, 300):
            print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
            print(f"Status Code for {question}: {response.status_code}")
            print(f"Response to {question}:\n")
            print_dict(json.loads(response.text))
            # put the response text in the queue
            q.put(response.text)
            if not q.empty():
                print(f"Completed task {count} / {num_requests}\n")
                with _bar_lock:
                    bar = make_progress_bar(bar, count, num_requests)
            # NOTE(review): task_done() is normally called by the consumer
            # after get(); kept here because nothing in this file calls
            # q.join(), so it has no observable effect.
            q.task_done()
        elif response.status_code == 429 and not q.empty():
            event.set()
            print("Server return too many requests; back off!! Reset event.")
        else:
            # Previously any other status was dropped without a trace.
            print(f"Unhandled status code {response.status_code} for {question}")
    except Exception as e:
        print(f"Server returned exception error {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Stress-test driver: fire `num_requests` concurrent completion requests
    # at a local server, then print every response that was collected.
    # `url`, `headers` and `bar` are module-level names read by send_request.

    url = "http://localhost:8080/completion"

    num_requests = 40
    # Capacity follows num_requests: workers put() before the queue is
    # drained, so a smaller queue would block them forever.
    q = Queue(maxsize=num_requests)
    threads = []

    bar = make_empty_bar(num_requests)

    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'User-Agent': 'Llamaserver.py'
    }

    # "Israel" appeared twice in the original list; the duplicate is removed.
    country_list = ["France", "Germany", "China", "USA", "Italy", "India",
                    "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
                    "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
                    "Bulgaria", "Romania", "Finland", "Sweden", "Norway", "Denmark", "Tanzania",
                    "Latvia", "Lithuania", "Estonia", "Pakistan", "Sri Lanka", "Malawi", "Mozambique"]

    for i in range(num_requests):
        # Wrap around when there are more requests than countries.
        country = country_list[i % len(country_list)]
        question = f"When was the first democratic election (if any) in {country}?"
        # NOTE: don't pass the parameter as a function call; pass in args
        print(f"Processing request {i} / {num_requests}: {question}\n")
        event = threading.Event()
        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
        t.start()
        threads.append(t)
        # input("Any key",)

    for thread in threads:
        thread.join()   # wait for all threads to finish

    print("FINISHED AND GETTING RESULTS")
    while not q.empty():
        text = q.get()
        print_dict(json.loads(text))
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -17,6 +17,7 @@ else()
|
|||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(beam-search)
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(cmap-example)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(finetune)
|
||||
|
|
9
examples/cmap-example/CMakeLists.txt
Normal file
9
examples/cmap-example/CMakeLists.txt
Normal file
|
@ -0,0 +1,9 @@
|
|||
# Build the kvcacheviz example executable from kvcacheviz.cpp.
set(TARGET kvcacheviz)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} kvcacheviz.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
    # ws2_32 is the Winsock library.
    target_link_libraries(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
29
examples/cmap-example/KVcacheViz.py
Normal file
29
examples/cmap-example/KVcacheViz.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
# A simple illustration of how to represent cache occupancy
|
||||
# graphically using unicode blocks
|
||||
# which are generated using print("\u2588"), print("\u2591")
|
||||
|
||||
from time import sleep
|
||||
import random
|
||||
|
||||
CACHE_SIZE = 50
|
||||
used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10]
|
||||
|
||||
def visualize_kv_cache(used_blocks, total_size):
    """Print a one-line block-glyph rendering of per-slot cache occupancy.

    For each entry of *used_blocks* a segment is emitted consisting of
    ``used`` filled glyphs (U+2589), ``total_size - used`` shaded glyphs
    (U+2591), the ``used/total_size`` counter, and a carriage return, so
    successive segments overwrite one another on the same terminal line.
    The line ends with the overall ``used/capacity`` total and no newline.

    Args:
        used_blocks: Sequence of occupied-cell counts, one per slot.
        total_size: Capacity of a single slot.
    """
    segments = [
        "\u2589" * used + "░" * (total_size - used) + f"{used:3.0f}/{total_size}]\r["
        for used in used_blocks
    ]
    cache_viz = "[" + "".join(segments)
    tot_used = sum(used_blocks)

    print(f"\r{cache_viz}] {tot_used}/{len(used_blocks) * total_size}", end="")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Animate the display until interrupted: redraw, wait, then drop the
    # oldest slot and append a new random occupancy value.
    try:
        while True:
            visualize_kv_cache(used_blocks, CACHE_SIZE)
            sleep(0.5)
            # Upper bound follows CACHE_SIZE instead of a second literal 50.
            used_blocks = used_blocks[1:] + [random.randint(0, CACHE_SIZE)]  # update used blocks
    except KeyboardInterrupt:
        # Leave the cursor on a fresh line when stopped with Ctrl-C.
        print()
|
68
examples/cmap-example/kvcacheviz.cpp
Normal file
68
examples/cmap-example/kvcacheviz.cpp
Normal file
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
A utility to represent the kv-cache occupancy graphically
|
||||
Takes as parameters
|
||||
- total cache size (-c)
|
||||
- number of simultaneous accesses/slots (-np)
|
||||
- a parameter related to the display context (max window width - data display requirements)
|
||||
It then uses a trick borrowed from tqdm to display occupancy
|
||||
TODO: Show contiguous space and block availability
|
||||
*/
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <cstdlib> // for rand()
|
||||
|
||||
// Draw an animated bar chart of per-slot kv-cache occupancy on the terminal.
//
//   used_blocks  occupied cells per slot (taken by value: the demo mutates it)
//   cache_size   total cache size, divided evenly between the slots
//   max_length   width of each bar in characters
//
// Each frame prints one bar per slot followed by a "{}" line, waits for a
// character on stdin, then moves the cursor back up so the next frame
// overwrites the previous one. Returns when stdin reaches end-of-file or
// fails, or immediately if there are no slots or the per-slot size is not
// positive.
static void show_kvcache(
    std::vector<int> used_blocks,
    int cache_size,
    int max_length
) {
    const int num_blocks = static_cast<int>(used_blocks.size());
    if (num_blocks == 0) {
        return; // avoids dividing by zero below
    }
    const int slot_cache_size = cache_size / num_blocks;
    if (slot_cache_size <= 0) {
        return; // would otherwise divide / take a modulus by zero
    }

    // Cursor sequence that rewinds over one frame: num_blocks bar lines, the
    // "{}" line and the line consumed by the user's key press. It does not
    // change between frames, so it is built once.
    std::string upcursor = "\033[K\033[A\033[K";
    for (int i = 0; i < num_blocks + 1; i++) {
        upcursor += "\033[A\033[K";
    }

    while (true) {

        // Print visualization after erasing the current line
        for (int i = 0; i < num_blocks; i++) {
            const int filled = used_blocks[i] * max_length / slot_cache_size;
            for (int j = 0; j < max_length; j++) {
                if (j < filled) {
                    std::cout << "\033[94m█\033[0m";
                }
                else {
                    std::cout << "\033[91m█\033[0m";
                }
            }
            std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl;
        }
        std::cout << "{";

        // Remove first element
        used_blocks.erase(used_blocks.begin());

        // Add new random block at the end (plain int: u_int is not standard C++)
        const int new_block = rand() % slot_cache_size;
        used_blocks.push_back(new_block);

        std::cout << "}" << std::endl;
        std::cin.get();
        if (!std::cin) {
            // End of input: stop instead of redrawing in a tight loop forever.
            break;
        }
        // Adjust the cursor so that the display overwrites itself
        std::cout << upcursor;
    }
}
|
||||
|
||||
// Demo entry point: run the occupancy display with fixed sample data.
int main() {
    const int cache_size = 65536;
    const int max_length = 128;

    // Eighteen slots: nine starting at 64 used cells, then nine at 46.
    std::vector<int> used_blocks(9, 64);
    used_blocks.insert(used_blocks.end(), 9, 46);

    show_kvcache(used_blocks, cache_size, max_length);
    return 0;
}
|
17
examples/main/Adjustmain.py
Normal file
17
examples/main/Adjustmain.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# Adjust the main.cpp file
|
||||
# to include the (Tokens used:) data output
|
||||
|
||||
MAIN_CPP_PATH = "/Users/edsilm2/llama.cpp/examples/main/main.cpp"

# (search, replacement) pairs applied in order to the text of main.cpp.
REPLACEMENTS = (
    ('printf("\\n> ");',
     'printf("\\033[31m(Tokens used: %d / %d)\\033[0m\\nJCP: ", n_past, n_ctx);'),
    ('context full and n_predict == -%d => stopping',
     'context full and n_predict == %d => stopping'),
)


def apply_replacements(source):
    """Return *source* with every pair in ``REPLACEMENTS`` applied in order.

    Each search string is replaced wherever it occurs; text that contains
    none of the search strings is returned unchanged.
    """
    for search_str, new_str in REPLACEMENTS:
        source = source.replace(search_str, new_str)
    return source


def patch_file(path):
    """Rewrite the file at *path* in place with the replacements applied.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    with open(path, 'r+') as file:
        patched = apply_replacements(file.read())
        file.seek(0)
        file.write(patched)
        # The second replacement shortens the text; without truncating, the
        # tail of the old content would be left behind after the new content.
        file.truncate()


if __name__ == "__main__":
    try:
        patch_file(MAIN_CPP_PATH)
    except FileNotFoundError as fe:
        print(f"Error searching for main.cpp: {fe}")
|
|
@ -522,7 +522,7 @@ int main(int argc, char ** argv) {
|
|||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
|
|||
LOG("waiting for user input\n");
|
||||
|
||||
if (params.instruct || params.chatml) {
|
||||
printf("\n> ");
|
||||
printf("\033[31m(Tokens used: %d / %d)\033[0m\nJCP: ", n_past, n_ctx);
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
|
||||
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5
|
||||
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 // originally 5
|
||||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
|
||||
|
@ -110,7 +110,7 @@
|
|||
#endif
|
||||
|
||||
#ifndef CPPHTTPLIB_LISTEN_BACKLOG
|
||||
#define CPPHTTPLIB_LISTEN_BACKLOG 5
|
||||
#define CPPHTTPLIB_LISTEN_BACKLOG 15 // originally 5
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
|
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue