diff --git a/Llamaserver.py b/Llamaserver.py index ce48aeae2..3e02d7721 100644 --- a/Llamaserver.py +++ b/Llamaserver.py @@ -52,8 +52,12 @@ def send_request(q, question, event, count, num_requests): delay = 0.1 global bar - - data = {'prompt': question} + + system = "You are a helpful assistant who answers all requests \ +courteously and accurately without undue repetition. \ +You pay close attention to the nuance of a question and respond accordingly." + + data = {'system': system, 'prompt': question} try: response = requests.post(url, headers=headers, json=data) @@ -114,7 +118,7 @@ if __name__ == "__main__": for i in range(num_requests): country = country_list[i % len(country_list)] - question = f"What was the total population of {country} in 2018?" + question = f"Tell me the political history of {country} up to 2018." # NOTE: don't pass the parameter as a function call; pass in args print(f"Processing request {i} / {num_requests}: {question}\n") event = threading.Event() diff --git a/cmakescript.sh b/cmakescript.sh new file mode 100755 index 000000000..6d77f9446 --- /dev/null +++ b/cmakescript.sh @@ -0,0 +1,13 @@ +# simple automation of cmake +# usage is ./cmakescript.sh Debug || Release + +rm -r build +cmake -B build +cd build + +if [ $# -eq 1 ] && [[ "$1" == "Debug" || "$1" == "Release" ]]; then + cmake --build . 
--config "$1" +else + echo "Usage: $0 (Debug|Release)" + exit 1 +fi diff --git a/examples/cmap-example/CMakeLists.txt b/examples/cmap-example/CMakeLists.txt index d62ca26cf..83344b42a 100644 --- a/examples/cmap-example/CMakeLists.txt +++ b/examples/cmap-example/CMakeLists.txt @@ -1,8 +1,8 @@ -set(TARGET TCPshellscript) +set(TARGET TCPportquiz) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} TCPshellscript.cpp) +add_executable(${TARGET} TCPportquiz.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/examples/cmap-example/TCPshellscript.cpp b/examples/cmap-example/TCPportquiz.cpp similarity index 100% rename from examples/cmap-example/TCPshellscript.cpp rename to examples/cmap-example/TCPportquiz.cpp diff --git a/examples/cmap-example/cursor.cpp b/examples/cmap-example/cursor.cpp new file mode 100644 index 000000000..92acb6b69 --- /dev/null +++ b/examples/cmap-example/cursor.cpp @@ -0,0 +1,26 @@ +// just trying to get the cursor position + +#include <cstdio> + +struct CursorPos { + int x; + int y; +}; + +static CursorPos getCursorPos() { + + // Get text cursor position + auto cursorPos = getCursorPos(); + + // Assign to struct + CursorPos pos; + pos.x = cursorPos.x; + pos.y = cursorPos.y; + + return pos; +} + +int main() { + CursorPos cursor = getCursorPos(); + printf("The x co-ordinate of the cursor is %zu\n; the y co-ordinate of the cursor is %zu\n", cursor.x, cursor.y); +} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9cb16dbb6..e935b8447 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -305,7 +305,7 @@ struct llama_client_slot void print_timings(llama_client_slot &slot, bool flag = false) const { if (flag) { - printf("\033[21;0H"); // needs to be sensitive to the 
number of slots + printf("\033[5;0H"); // needs to be sensitive to the number of slots }; LOG_TEE("Finished processing slot %d.\n", slot.id); LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", @@ -315,7 +315,7 @@ struct llama_client_slot LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); if (flag) { - printf("\033[25;0HPress any key ... "); + printf("\033[KPress any key ... \n"); getchar(); } } @@ -323,11 +323,11 @@ struct llama_client_slot // experimental/diagostic graphic to show kvcache status // requires just `slots` and `params.n_ctx` as parameters -static void kvgraphics(std::vector& slots, int cache_size) { +static void kvgraphics(std::vector& slots) { int max_length = 128; int num_blocks = slots.size(); - size_t slot_cache_size = cache_size / num_blocks; + size_t slot_cache_size = slots[0].n_ctx; bool cls_flag = true; // this flag only prevents repeated cls inside one call std::string slot_symbol1 = ""; std::string slot_symbol2 = ""; @@ -344,13 +344,16 @@ static void kvgraphics(std::vector& slots, int cache_size) { // Print visualization // Always start at the top left of the window (H means 'move cursor to this position'; 2J = cls) + // See below for a rethink because controlling log printing is such a pain in C++11 // Only clear the screen the first time round if (cls_flag) { - printf("\033[2J"); + // printf("\033[2J"); cls_flag = false; } printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n"); + // we can know and control how many lines of output we are printing so just start below that and fix the graphics location + printf("\033[%d;0H", 10); for(int i=0; i& slots, int cache_size) { } printf(" %4zu/%5zu %2d %s %s %s\n", slots[i].cache_tokens.size(), slot_cache_size, slots[i].id, slot_symbol1.c_str(), slot_symbol2.c_str(), slot_symbol3.c_str()); } - printf("\n\033[%dJ", num_blocks+5); // move 
cursor to end of cache display + printf("\033[5;0H"); // just start two lines below the heading + //printf("\n\033[%d;0H\033[%dJ", 10, num_blocks+5); // move cursor to end of cache display and clear thereafter } struct llama_server_context @@ -494,7 +498,7 @@ struct llama_server_context slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + LOG_TEE(" -> Slot %2i - max context: %i\n", slot.id, n_ctx_slot); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -504,7 +508,7 @@ struct llama_server_context GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); + LOG_TEE(" -> Slot %2i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); } slot.ga_i = 0; @@ -581,6 +585,7 @@ struct llama_server_context for (llama_client_slot & slot : slots) { + printf("\033[5;0H"); if (slot.id == -1 && slot.available()) { LOG_TEE("Unallocated task now using slot %d", slot.id); @@ -1438,6 +1443,7 @@ struct llama_server_context // why should task.data already contain a slot_id key when we haven't allocated it? // because if it doesnt the returned value will be -1; what makes it anything else? int requested_slot = json_value(task.data, "slot_id", -1); + printf("\033[5;0H\033[K"); LOG_TEE("Task %d requesting slot %d\n", task.id, requested_slot); // why are we suddenly using 'slot' as a pointer here - confusing? 
@@ -1449,6 +1455,7 @@ struct llama_server_context queue_tasks.defer(task); break; } else { + printf("\033[5;0H\033[K"); LOG_TEE("Activating slot %d.\n", (*slot).id); } @@ -1554,6 +1561,7 @@ struct llama_server_context const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1; const int n_discard = n_left / 2; // is this arbitrary? + printf("\033[5;0H\033[K"); LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1588,6 +1596,7 @@ struct llama_server_context slot.command = NONE; slot.t_last_used = ggml_time_us(); + printf("\033[6;0H\033[K"); LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); queue_tasks.notify_slot_changed(); // why don't we immediately reallocate the released slot without waiting? Is this what -cb does? 
@@ -1736,6 +1745,7 @@ struct llama_server_context slot.ga_i = ga_i; } + printf("\033[7;0H\033[K"); LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } @@ -1752,6 +1762,7 @@ struct llama_server_context } } + printf("\033[5;0H\033[K"); LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1936,7 +1947,7 @@ struct llama_server_context // we are still inside llama_server_context so we can use an unqualified parameter if (skvgraphics) { - kvgraphics(slots, params.n_ctx); + kvgraphics(slots); } return true; @@ -2916,7 +2927,8 @@ int main(int argc, char **argv) return; } // it appears that here we first get ONE request to parse; then TEN; then ONE-by-ONE - printf("Request body to parse: %s", req.body.c_str()); + printf("\033[5;0H\033[K"); + LOG_TEE("Request body to parse: %s", req.body.c_str()); if (llama.skvinteract) { getchar(); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 28dbfe970..1aa45f28c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -36,6 +36,7 @@ extern bool server_verbose; #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) 
server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + // // parallel // @@ -244,7 +245,9 @@ struct llama_server_queue { void defer(task_server task) { std::unique_lock lock(mutex_tasks); queue_tasks_deferred.push_back(std::move(task)); - LOG_TEE("Deferred task queue now has %3zu members.\n", queue_tasks_deferred.size()); + printf("\033[1;50H*** "); + LOG_TEE("Deferred queue now has %3zu members.\n", queue_tasks_deferred.size()); + printf("\033[5;0H"); } // Get the next id for creating a new task @@ -390,15 +393,17 @@ struct llama_server_response { void add_waiting_task_id(int task_id) { std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); - printf("\033[21;0H"); + printf("\033[1;50H*** "); LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size()); + printf("\033[5;0H"); } void remove_waiting_task_id(int task_id) { std::unique_lock lock(mutex_results); waiting_task_ids.erase(task_id); - printf("\033[21;0H"); + printf("\033[2;50H*** "); LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size()); + printf("\033[5;0H"); } // This function blocks the thread until there is a response for this task_id