From 74d13ef3358e6e2407650e3890ff592fc49c3f87 Mon Sep 17 00:00:00 2001
From: pudepiedj
Date: Mon, 26 Feb 2024 12:09:06 +0000
Subject: [PATCH] Server updates

---
 Llamaserver.py                 |   5 +-
 apikeys.txt                    |   3 +
 cmakescript.sh                 |   2 +
 examples/server/CMakeLists.txt |   1 +
 examples/server/server.cpp     | 109 +++++++++++++++++++--------------
 examples/server/utils.hpp      |  24 +++-----
 6 files changed, 81 insertions(+), 63 deletions(-)
 create mode 100644 apikeys.txt

diff --git a/Llamaserver.py b/Llamaserver.py
index 3e02d7721..0342a22d0 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -104,10 +104,13 @@ if __name__ == "__main__":

     bar = make_empty_bar(num_requests)

+    api_key = input("What is your API key? ")
+
     headers = {
         'Content-Type': 'application/json',
         'Accept': 'application/json',
-        'User-Agent': 'Llamaserver.py'
+        'User-Agent': 'Llamaserver.py',
+        'Authorization': f'Bearer {api_key}'
     }

     country_list = ["France", "Germany", "China", "USA", "Italy", "India",
diff --git a/apikeys.txt b/apikeys.txt
new file mode 100644
index 000000000..441abe9f3
--- /dev/null
+++ b/apikeys.txt
@@ -0,0 +1,3 @@
+john123456
+susan987654
+guestabcdef
diff --git a/cmakescript.sh b/cmakescript.sh
index 6d77f9446..70e4eece4 100755
--- a/cmakescript.sh
+++ b/cmakescript.sh
@@ -11,3 +11,5 @@ else
     echo "Usage: $0 (Debug|Release)"
     exit 1
 fi
+
+cd build
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index cc13b2d63..965a3cb68 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
 add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index e935b8447..fc4f36e93 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -37,7 +37,7 @@ using json = nlohmann::json;

 struct server_params
 {
-    std::string hostname = "0.0.0.0"; // 127.0.0.1 restricts to localhost only; use 0.0.0.0 for local network.
+    std::string hostname = "127.0.0.1"; // localhost by default; the --host switch selects 0.0.0.0 to serve the public network.
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
     std::string chat_template = "";
@@ -307,12 +307,12 @@ struct llama_client_slot
        if (flag) {
            printf("\033[5;0H"); // needs to be sensitive to the number of slots
        };
-       LOG_TEE("Finished processing slot %d.\n", slot.id);
-       LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+       LOG("Finished processing slot %d.\n", slot.id);
+       LOG("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
-       LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+       LOG("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, t_token_generation, n_decoded, t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
-       LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+       LOG("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
        if (flag) {
            printf("\033[KPress any key ... \n");
@@ -347,7 +347,7 @@ static void kvgraphics(std::vector<llama_client_slot>& slots) {
    // See below for a rethink, because controlling log printing is such a pain in C++11
    // Only clear the screen the first time round
    if (cls_flag) {
-       // printf("\033[2J");
+       printf("\033[2J");
        cls_flag = false;
    }
    printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");
@@ -355,7 +355,7 @@ static void kvgraphics(std::vector<llama_client_slot>& slots) {
    // we can know and control how many lines of output we are printing so just start below that and fix the graphics location
    printf("\033[%d;0H", 10);
    for(int i=0; i<int(slots.size()); i++) {
@@ ... @@
        GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-       LOG_TEE(" -> Slot %2i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+       LOG(" -> Slot %2i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
    }

    slot.ga_i = 0;
@@ -588,13 +588,13 @@
            printf("\033[5;0H");
            if (slot.id == -1 && slot.available())
            {
-               LOG_TEE("Unallocated task now using slot %d", slot.id);
+               LOG("Unallocated task now using slot %d", slot.id);
                return &slot;
            }

            if (slot.id == id && slot.available())
            {
-               LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
+               LOG("Using id-based available slot called by id: %d\n", slot.id);
                return &slot;
            }
@@ -816,10 +816,10 @@
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                       LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+                       LOG("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
                        return false;
                    }
-                   LOG_TEE("slot %i - loaded image\n", slot->id);
+                   LOG("slot %i - loaded image\n", slot->id);
                    img_sl.request_encode_image = true;
                    slot->images.push_back(img_sl);
                }
@@ -851,12 +851,12 @@
                        }
                    }
                    if (!found) {
-                       LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                       LOG("ERROR: Image with id: %i, not found.\n", img_id);
                        slot->images.clear();
                        return false;
                    }
                } catch (const std::invalid_argument& e) {
-                   LOG_TEE("Invalid image number id in prompt\n");
+                   LOG("Invalid image number id in prompt\n");
                    slot->images.clear();
                    return false;
                }
@@ -879,7 +879,7 @@ struct llama_server_context
        all_slots_are_idle = false;

-       LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+       LOG("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);

        return true;
    }
@@ -906,7 +906,7 @@ struct llama_server_context

            if (llama_decode(ctx, batch) != 0)
            {
-               LOG_TEE("%s: llama_decode() failed\n", __func__);
+               LOG("%s: llama_decode() failed\n", __func__);
                return;
            }
@@ -917,7 +917,7 @@ struct llama_server_context
            }
        }

-       LOG_TEE("system prompt updated\n");
+       LOG("system prompt updated\n");
        system_need_update = false;
    }
@@ -1097,7 +1097,7 @@ struct llama_server_context
            }

            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-               LOG_TEE("Error processing the given image");
+               LOG("Error processing the given image");
                return false;
            }
@@ -1110,7 +1110,7 @@ struct llama_server_context

    void send_error(task_server& task, const std::string &error)
    {
-       LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+       LOG("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1353,7 +1353,7 @@ struct llama_server_context
                };

                if (llama_decode(ctx, batch_view))
                {
-                   LOG_TEE("%s : failed to eval\n", __func__);
+                   LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1371,7 +1371,7 @@ struct llama_server_context
                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
-                   LOG_TEE("%s : failed to eval image\n", __func__);
+                   LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1444,19 +1444,19 @@ struct llama_server_context
                // because if it doesn't, the returned value will be -1; what makes it anything else?
                int requested_slot = json_value(task.data, "slot_id", -1);
                printf("\033[5;0H\033[K");
-               LOG_TEE("Task %d requesting slot %d\n", task.id, requested_slot);
+               LOG("Task %d requesting slot %d\n", task.id, requested_slot);
                // why are we suddenly using 'slot' as a pointer here - confusing?
                llama_client_slot *slot = get_slot(requested_slot); // returns nullptr if no slot available
                if (slot == nullptr)
                {
                    // if no slot is available, we defer this task for processing later
-                   LOG_TEE("no slot is available for task %d\n", task.id);
+                   LOG("no slot is available for task %d\n", task.id);
                    queue_tasks.defer(task);
                    break;
                }
                else {
                    printf("\033[5;0H\033[K");
-                   LOG_TEE("Activating slot %d.\n", (*slot).id);
+                   LOG("Activating slot %d.\n", (*slot).id);
                }

                if (task.data.contains("system_prompt"))
@@ -1528,7 +1528,7 @@ struct llama_server_context
    bool update_slots() {
        if (system_need_update)
        {
-           LOG_TEE("updating system prompt\n");
+           LOG("updating system prompt\n");
            update_system_prompt();
        }
@@ -1538,7 +1538,7 @@ struct llama_server_context
        {
            if (system_prompt.empty() && clean_kv_cache)
            {
-               LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+               LOG("all slots are idle and system prompt is empty; clearing the KV cache\n");
                kv_cache_clear();
            }
            return true;
@@ -1562,7 +1562,7 @@ struct llama_server_context
                    const int n_discard = n_left / 2; // is this arbitrary?
printf("\033[5;0H\033[K"); - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); + LOG("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1597,7 +1597,7 @@ struct llama_server_context slot.t_last_used = ggml_time_us(); printf("\033[6;0H\033[K"); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + LOG("slot %d released (%d tokens remain in cache)\n", slot.id, (int) slot.cache_tokens.size()); queue_tasks.notify_slot_changed(); // why don't we immediately reallocate the released slot without waiting? Is this what -cb does? continue; @@ -1746,7 +1746,7 @@ struct llama_server_context } printf("\033[7;0H\033[K"); - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } slot.cache_tokens = prompt_tokens; @@ -1754,7 +1754,7 @@ struct llama_server_context if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); + LOG("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); slot.n_past--; if (slot.ga_i > 0) { @@ -1763,7 +1763,7 @@ struct llama_server_context } printf("\033[5;0H\033[K"); - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); + LOG("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1800,7 +1800,7 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); + LOG("failed processing images\n"); return false; } @@ -1837,10 +1837,10 @@ struct llama_server_context const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + LOG("\n"); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, 
                    llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
@@ -1850,7 +1850,7 @@ struct llama_server_context

                    slot.ga_i += slot.ga_w / slot.ga_n;

-                   LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                   LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                }
                slot.n_past_se += n_tokens;
            }
@@ -1877,12 +1877,12 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                   LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                   LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }

                // we get here if ret = 1 and n_batch != 1
-               LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+               LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -2823,11 +2823,14 @@ int main(int argc, char **argv)
    log_data["hostname"] = sparams.hostname;
    log_data["port"]     = std::to_string(sparams.port);

-   if (sparams.api_keys.size() == 1) {
+   if (sparams.api_keys.size() == 1) { // what happens if the size is zero?
        log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
    } else if (sparams.api_keys.size() > 1) {
        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
    }
+   for (int i = 0; i < int(sparams.api_keys.size()); i++) {
@@ ... @@
    auto middleware_validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
-       // If API key is not set, skip validation
+       // If API key is not set, because the file is empty, skip validation
        if (sparams.api_keys.empty()) {
            return true;
        }
@@ -2866,9 +2869,21 @@
        std::string prefix = "Bearer ";
        if (auth_header.substr(0, prefix.size()) == prefix) {
            std::string received_api_key = auth_header.substr(prefix.size());
-           if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
-               return true; // API key is valid
-           }
+           LOG("Received API key = %s\n", received_api_key.c_str());
+           for (int i = 0; i < int(sparams.api_keys.size()); i++) {
+               // keys read from the key file are one character longer than the Bearer value (most likely a trailing newline), so compare a trimmed copy
+               std::string uncut_api = sparams.api_keys[i]; // store original apikey
+               std::string cut_api = uncut_api.substr(0, uncut_api.size() - 1); // do not shorten in-place by using erase
+               if (received_api_key != cut_api) {
+                   LOG("%s != %s and length left = %zu, length right = %zu\n", received_api_key.c_str(), cut_api.c_str(), received_api_key.size(), cut_api.size());
+               } else if (received_api_key == cut_api) {
+                   LOG("%s = %s FOUND IT!!!\n", received_api_key.c_str(), cut_api.c_str());
+                   return true;
+               }
+           }
+           //if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
+           //    return true; // API key is valid
+           //}
        }

        // API key is invalid or not provided
@@ -2928,14 +2943,14 @@
            }
            // it appears that here we first get ONE request to parse; then TEN; then ONE-by-ONE
            printf("\033[5;0H\033[K");
-           LOG_TEE("Request body to parse: %s", req.body.c_str());
+           LOG_TEE("Request body to parse: %s.\n", req.body.c_str());
            if (llama.skvinteract) {
                getchar();
            }
            json data = json::parse(req.body);
            const int task_id = llama.queue_tasks.get_new_id(); // just returns a new id number
            llama.queue_results.add_waiting_task_id(task_id);
-           LOG_TEE("Initiated new task %d.\n", task_id);
+           LOG("Initiated new task %d.\n", task_id);
            llama.request_completion(task_id, data, false, false, -1);
            if (!json_value(data, "stream", false)) {
                std::string completion_text; // is this ever used?
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 1aa45f28c..84822c1fd 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -151,7 +151,7 @@ static inline void server_log(const char *level, const char *function, int line,
    }

    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-   printf("%.*s\n", (int)str.size(), str.data());
+   LOG("%.*s\n", (int)str.size(), str.data());
    fflush(stdout);
}
@@ -236,7 +236,7 @@ struct llama_server_queue {
            task.id = id; // originally id++ but this repeats get_new_id below
        }
        queue_tasks.push_back(std::move(task));
-       //LOG_TEE("Queue now has %2zu members.\n", queue_tasks.size());
+       //LOG("Queue now has %2zu members.\n", queue_tasks.size());
        condition_tasks.notify_one();
        return task.id;
    }
@@ -245,15 +245,13 @@
    void defer(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
-       printf("\033[1;50H*** ");
-       LOG_TEE("Deferred queue now has %3zu members.\n", queue_tasks_deferred.size());
-       printf("\033[5;0H");
+       LOG("Deferred queue now has %3zu members.\n", queue_tasks_deferred.size());
    }

    // Get the next id for creating a new task
    int get_new_id() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
-       LOG_TEE("New task id returned with value %d.\n", id);
+       LOG("New task id returned with value %d.\n", id);
        return id++;
    }
@@ -297,10 +295,10 @@ struct llama_server_queue {
    // Start the main loop. Called from the very end of server.cpp
    void start_loop() {
        running = true;
-       //LOG_TEE("In start_loop have new task number %d.\n", id);
+       //LOG("In start_loop have new task number %d.\n", id);
        while (true) {
            // new task arrived
-           // LOG_TEE("In start_loop have new task number %d.\n", id);
+           // LOG("In start_loop have new task number %d.\n", id);
            {
                while (true)
                {
@@ -393,17 +391,13 @@ struct llama_server_response {
    void add_waiting_task_id(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
-       printf("\033[1;50H*** ");
-       LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size());
-       printf("\033[5;0H");
+       LOG("Waiting task list size after addition: %2zu.\n", waiting_task_ids.size());
    }

    void remove_waiting_task_id(int task_id) {
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
-       printf("\033[2;50H*** ");
-       LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
-       printf("\033[5;0H");
+       LOG("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
    }

    // This function blocks the thread until there is a response for this task_id
@@ -441,7 +435,7 @@ struct llama_server_response {
        std::unique_lock<std::mutex> lock(mutex_results);
        LOG_VERBOSE("send new result", {});
        for (auto& task_id : waiting_task_ids) {
-           // LOG_TEE("waiting task id %i \n", task_id);
+           // LOG("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
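
Usage note: the sketch below shows how a client would authenticate against the patched server, mirroring the Authorization header added to Llamaserver.py above. It is illustrative only: the /completion endpoint, the 8080 port and the payload fields are assumptions, not taken from this patch, and the strip() reflects the trailing-newline mismatch that the cut_api workaround in server.cpp compensates for.

    # client_sketch.py - illustrative only; endpoint, port and payload are assumed, not part of this patch
    import json
    import urllib.request

    # read one key from apikeys.txt; strip() removes the trailing newline that would
    # otherwise make the stored key one character longer than the Bearer value
    with open("apikeys.txt") as f:
        api_key = f.readline().strip()

    req = urllib.request.Request(
        "http://127.0.0.1:8080/completion",      # assumed endpoint and default port
        data=json.dumps({"prompt": "Tell me about France.", "n_predict": 64}).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "User-Agent": "Llamaserver.py",
            "Authorization": f"Bearer {api_key}",  # same header the patched Llamaserver.py sends
        },
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["content"])

If the key does not match one loaded by the server, the middleware_validate_api_key lambda rejects the request, so a quick way to test the patch is to run this once with a key from apikeys.txt and once with a deliberately wrong key.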