Major changes to server plus kvcache graphic
This commit is contained in:
parent
326b418b59
commit
5249985578
9 changed files with 1033 additions and 532 deletions
110
Llamaserver.py
Normal file
110
Llamaserver.py
Normal file
|
@ -0,0 +1,110 @@
|
||||||
|
from queue import Queue
|
||||||
|
import threading
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
def print_dict(data):
    """Recursively print every value stored under a "content" key.

    Walks a decoded JSON structure. Dictionaries and lists are descended
    into; any other value is printed only when its key is "content".

    Args:
        data: A dict or list as produced by ``json.loads``. Values of any
            other type are ignored, so lists that hold plain strings or
            numbers no longer raise ``AttributeError``.
    """
    if isinstance(data, list):
        for entry in data:
            print_dict(entry)
        return
    if not isinstance(data, dict):
        # Scalars inside lists carry no key, so there is nothing to print.
        return
    for k, v in data.items():
        if isinstance(v, (dict, list)):
            print_dict(v)
        elif k == "content":
            print(f"Key: {k:>30}: {v}")
|
||||||
|
|
||||||
|
def print_response(text):
    """Write ``text`` to standard output followed by a newline."""
    print(text)
|
||||||
|
|
||||||
|
def make_empty_bar(num_requests):
    """Build the initial progress bar: one block glyph per request.

    Args:
        num_requests: Number of glyphs to place in the bar.

    Returns:
        A string of ``num_requests`` U+2589 characters. The bar is also
        echoed to standard output.
    """
    bar = "\u2589" * num_requests
    print(f"Bar is now {bar}.")
    return bar
|
||||||
|
|
||||||
|
def make_progress_bar(bar, count, num_requests):
    """Mark request ``count`` as finished in the progress bar.

    Replaces the glyph at position ``count`` with the stopwatch glyph
    (U+23F1) and echoes the change to standard output.

    Args:
        bar: Current bar string, one character per request.
        count: Zero-based index of the request that completed.
        num_requests: Total number of requests the bar represents.

    Returns:
        The updated bar. The bar is returned unchanged when ``count`` falls
        outside ``range(num_requests)`` or beyond the end of ``bar``, rather
        than raising ``IndexError``.
    """
    marker = "\u23F1"
    # Both glyphs are a single code point, so the request index is also the
    # string index; no scan over every position is needed.
    if 0 <= count < min(num_requests, len(bar)):
        print(f"Bar position {count} is {bar[count]}")
        bar = bar[:count] + marker + bar[count + 1:]
        print(f"Bar is now {bar}")
    return bar
|
||||||
|
|
||||||
|
# Guards the read-modify-write of the module-level progress bar, which every
# request thread updates after its response arrives.
_bar_lock = threading.Lock()


def send_request(q, question, event, count, num_requests):
    """POST one prompt to the completion endpoint and queue the raw reply.

    Reads the module-level names ``url`` and ``headers`` and updates the
    module-level ``bar``; all three are set up by the ``__main__`` section
    before any thread is started.

    Args:
        q: Queue that receives the response body text on success.
        question: Prompt string sent as the ``prompt`` field of the JSON body.
        event: ``threading.Event`` that is set when the server answers 429.
        count: Index of this request, used for logging and the progress bar.
        num_requests: Total number of requests, used for logging.

    Any exception raised while sending or processing the request is reported
    on standard output and not propagated, so one failed request does not
    kill its thread with a traceback.
    """
    global bar

    data = {'prompt': question}

    try:
        response = requests.post(url, headers=headers, json=data)
        # NOTE(review): this accepts exactly 200 and 300, not the 2xx range;
        # confirm that is intended.
        if response.status_code in [200,300]:
            print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
            print(f"Status Code for {question}: {response.status_code}")
            print(f"Response to {question}:\n")
            print_dict(json.loads(response.text))
            # put the response text in the queue
            q.put(response.text)
            if not q.empty():
                print(f"Completed task {count} / {num_requests}\n")
                # Without the lock two threads can read the same bar and one
                # of the two updates is lost.
                with _bar_lock:
                    bar = make_progress_bar(bar, count, num_requests)
                q.task_done()
        elif response.status_code == 429 and not q.empty():
            event.set()
            print("Server return too many requests; back off!! Reset event.")
    except Exception as e:
        print(f"Server returned exception error {e}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # url, headers and bar are module globals that send_request() reads;
    # assigning them here at module level is enough, no `global` needed.
    url = "http://localhost:8080/completion"
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'User-Agent': 'Llamaserver.py'
    }

    num_requests = 40
    q = Queue(maxsize = 40)
    bar = make_empty_bar(num_requests)

    # NOTE(review): "Israel" appears twice in this list. The list is shorter
    # than num_requests, so the modulo below wraps around to the start.
    country_list = ["France", "Germany", "China", "USA", "Italy", "India",
                    "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
                    "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
                    "Bulgaria", "Romania", "Finland", "Sweden", "Norway", "Denmark", "Tanzania", "Israel",
                    "Latvia", "Lithuania", "Estonia", "Pakistan", "Sri Lanka", "Malawi", "Mozambique"]

    # Fire one thread per request; each gets its own Event.
    threads = []
    for i in range(num_requests):
        country = country_list[i % len(country_list)]
        question = f"When was the first democratic election (if any) in {country}?"
        print(f"Processing request {i} / {num_requests}: {question}\n")
        event = threading.Event()
        # NOTE: pass the callable and its args separately; do not call it here.
        worker = threading.Thread(
            target=send_request,
            args=(q, question, event, i, num_requests),
        )
        worker.start()
        threads.append(worker)

    # Wait for every request thread to finish before draining the queue.
    for worker in threads:
        worker.join()

    print("FINISHED AND GETTING RESULTS")
    while not q.empty():
        print_dict(json.loads(q.get()))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ else()
|
||||||
add_subdirectory(batched-bench)
|
add_subdirectory(batched-bench)
|
||||||
add_subdirectory(beam-search)
|
add_subdirectory(beam-search)
|
||||||
add_subdirectory(benchmark)
|
add_subdirectory(benchmark)
|
||||||
|
add_subdirectory(cmap-example)
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
add_subdirectory(finetune)
|
add_subdirectory(finetune)
|
||||||
|
|
9
examples/cmap-example/CMakeLists.txt
Normal file
9
examples/cmap-example/CMakeLists.txt
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
# Build and install the kvcacheviz example executable.
set(TARGET kvcacheviz)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} kvcacheviz.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
# Windows builds additionally link the ws2_32 library.
if (WIN32)
    target_link_libraries(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
29
examples/cmap-example/KVcacheViz.py
Normal file
29
examples/cmap-example/KVcacheViz.py
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
# A simple illustration of how to represent cache occupancy
|
||||||
|
# graphically using unicode blocks
|
||||||
|
# which are generated using print("\u2588"), print("\u2591")
|
||||||
|
|
||||||
|
from time import sleep
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Per-row capacity; passed to visualize_kv_cache() as total_size.
CACHE_SIZE = 50
# Initial occupancy of each row; the driver loop below shifts this list
# and appends a new random value on every refresh.
used_blocks = [5, 3, 2, 1, 10, 2, 6, 4, 7, 10]
|
||||||
|
|
||||||
|
def visualize_kv_cache(used_blocks, total_size):
    """Print a bar per entry of ``used_blocks`` plus an overall usage total.

    Each entry is drawn as that many U+2589 glyphs followed by shaded
    glyphs up to ``total_size``, then a ``used/total]`` label and a carriage
    return. Everything is emitted in a single ``print`` with no trailing
    newline.

    Args:
        used_blocks: Sequence of per-row occupancy counts.
        total_size: Capacity of one row.
    """
    segments = []
    for used in used_blocks:
        filled = "\u2589" * used
        free = "░" * (total_size - used)
        segments.append(f"{filled}{free}{used:3.0f}/{total_size}]\r[")
    cache_viz = "[" + "".join(segments)

    tot_used = sum(used_blocks)
    capacity = len(used_blocks) * total_size
    print(f"\r{cache_viz}] {tot_used}/{capacity}", end="")
|
||||||
|
|
||||||
|
|
||||||
|
# Demo driver: redraw the cache view twice a second, forever.
while True:
    visualize_kv_cache(used_blocks, CACHE_SIZE)
    sleep(0.5)
    # Slide the window: drop the oldest row and append a new random
    # occupancy. The upper bound is CACHE_SIZE rather than a literal so a
    # row can never exceed the capacity the bars are drawn against.
    used_blocks = used_blocks[1:] + [random.randint(0, CACHE_SIZE)]
|
68
examples/cmap-example/kvcacheviz.cpp
Normal file
68
examples/cmap-example/kvcacheviz.cpp
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
/*
|
||||||
|
A utility to represent the kv-cache occupancy graphically
|
||||||
|
Takes as parameters
|
||||||
|
- total cache size (-c)
|
||||||
|
- number of simultaneous accesses/slots (-np)
|
||||||
|
- a parameter related to the display context (max window width - data display requirements)
|
||||||
|
It then uses a trick borrowed from tqdm to display occupancy
|
||||||
|
TODO: Show contiguous space and block availability
|
||||||
|
*/
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <vector>
|
||||||
|
#include <cstdlib> // for rand()
|
||||||
|
|
||||||
|
// Draw an animated, self-overwriting bar chart of per-slot cache occupancy.
//
// used_blocks  occupancy of each slot (taken by value; it is mutated here)
// cache_size   total cache size, divided evenly between the slots
// max_length   width of each bar in character cells
//
// After each frame the function waits for a character on stdin, then moves
// the cursor back up so the next frame overwrites the previous one. It
// returns when stdin reaches end-of-file or fails, or immediately when
// there are no slots or the per-slot size would be zero.
static void show_kvcache(
    std::vector<int> used_blocks,
    int cache_size,
    int max_length
) {
    const int num_blocks = static_cast<int>(used_blocks.size());
    if (num_blocks == 0) {
        return; // nothing to draw, and cache_size / 0 is undefined
    }
    const int slot_cache_size = cache_size / num_blocks;
    if (slot_cache_size <= 0) {
        return; // would divide and take a modulus by zero below
    }

    // Cursor sequence that rewinds over one frame: the initial clear/up,
    // one line per slot, plus one for the closing "{}" line. It depends only
    // on num_blocks, so build it once.
    std::string upcursor = "\033[K\033[A\033[K";
    for (int i = 0; i < num_blocks; i++) {
        upcursor += "\033[A\033[K";
    }
    upcursor += "\033[A\033[K";

    while (true) {
        // One row per slot: blue cells for the used share, red for the rest.
        for (int i = 0; i < num_blocks; i++) {
            for (int j = 0; j < max_length; j++) {
                if (j < used_blocks[i] * max_length / slot_cache_size) {
                    std::cout << "\033[94m█\033[0m";
                } else {
                    std::cout << "\033[91m█\033[0m";
                }
            }
            std::cout << " " << std::setw(5) << used_blocks[i] << "/" << std::setw(5) << slot_cache_size << std::endl;
        }
        std::cout << "{";

        // Remove first element
        used_blocks.erase(used_blocks.begin());

        // Add new random block at the end
        int new_block = rand() % slot_cache_size;
        used_blocks.push_back(new_block);

        std::cout << "}" << std::endl;

        // Wait for a key; stop instead of spinning once input is exhausted.
        std::cin.get();
        if (!std::cin) {
            break;
        }

        // Adjust the cursor so that the display overwrites itself
        std::cout << upcursor;
    }
}
|
||||||
|
|
||||||
|
// Demo entry point: run the visualiser on fixed sample data.
int main() {
    const int cache_size = 65536;
    const int max_length = 128;

    // Eighteen slots: nine at occupancy 64 followed by nine at 46.
    std::vector<int> used_blocks(9, 64);
    used_blocks.insert(used_blocks.end(), 9, 46);

    show_kvcache(used_blocks, cache_size, max_length);
}
|
17
examples/main/Adjustmain.py
Normal file
17
examples/main/Adjustmain.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
# Adjust the main.cpp file
|
||||||
|
# to include the (Tokens used:) data output
|
||||||
|
|
||||||
|
try:
    with open("/Users/edsilm2/llama.cpp/examples/main/main.cpp", 'r+') as file:
        main = file.read()

        # Replace the plain interactive prompt with one that also reports
        # token usage.
        search_str = 'printf("\\n> ");'
        new_str = 'printf("\\033[31m(Tokens used: %d / %d)\\033[0m\\nJCP: ", n_past, n_ctx);'
        main = main.replace(search_str, new_str)

        # Drop the stray '-' in front of %d in the "context full" log line.
        search_str = 'context full and n_predict == -%d => stopping'
        new_str = 'context full and n_predict == %d => stopping'
        main = main.replace(search_str, new_str)

        # Rewrite in place. truncate() is required because the patched text
        # can be shorter than the original, which would otherwise leave
        # stale bytes at the end of the file.
        file.seek(0)
        file.write(main)
        file.truncate()
except FileNotFoundError as fe:
    print(f"Error searching for main.cpp: {fe}")
|
|
@ -522,7 +522,7 @@ int main(int argc, char ** argv) {
|
||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||||
if (params.n_predict == -2) {
|
if (params.n_predict == -2) {
|
||||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
|
||||||
LOG("waiting for user input\n");
|
LOG("waiting for user input\n");
|
||||||
|
|
||||||
if (params.instruct || params.chatml) {
|
if (params.instruct || params.chatml) {
|
||||||
printf("\n> ");
|
printf("\033[31m(Tokens used: %d / %d)\033[0m\nJCP: ", n_past, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
|
|
|
@ -19,7 +19,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
|
#ifndef CPPHTTPLIB_KEEPALIVE_MAX_COUNT
|
||||||
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5
|
#define CPPHTTPLIB_KEEPALIVE_MAX_COUNT 5 // originally 5
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
|
#ifndef CPPHTTPLIB_CONNECTION_TIMEOUT_SECOND
|
||||||
|
@ -110,7 +110,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPPHTTPLIB_LISTEN_BACKLOG
|
#ifndef CPPHTTPLIB_LISTEN_BACKLOG
|
||||||
#define CPPHTTPLIB_LISTEN_BACKLOG 5
|
#define CPPHTTPLIB_LISTEN_BACKLOG 15 // originally 5
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue