From 5d61ae8d2a5b807332b2216fdf602c85e4276714 Mon Sep 17 00:00:00 2001
From: pudepiedj
Date: Sat, 2 Mar 2024 10:24:07 +0000
Subject: [PATCH] Renaming some vars

---
 examples/server/server.cpp | 14 ++++++------
 examples/server/utils.hpp  | 47 ++------------------------------------
 2 files changed, 9 insertions(+), 52 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5b8d1a9e3..e37da6133 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -253,8 +253,8 @@ struct server_slot {
         std::string stdout_reset;
         std::string stderr_target;
         std::string stderr_reset;
-        double t_token = t_prompt_processing / num_prompt_tokens_processed;
-        double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+        double t_token = t_prompt_processing / n_prompt_tokens_processed;
+        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
         //printf("\033[72;0H]");
         sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                 t_prompt_processing, n_prompt_tokens_processed,
@@ -328,7 +328,7 @@ struct server_metrics {
 
 // experimental/diagostic graphic to show kvcache status
 // requires just `slots` and `params.n_ctx` as parameters
-static void kvgraphics(std::vector<llama_client_slot>& slots) {
+static void kvgraphics(std::vector<server_slot>& slots) {
 
     int max_length = 144;
     int num_blocks = slots.size();
@@ -604,7 +604,7 @@ struct llama_server_context
     // the logic seems wrong in this function
     // why should there be an id in a task matching a slot.id before a slot has been assigned?
     // most commonly id = -1 so we deal with that first rather than the specified id > 0
-    llama_client_slot* get_slot(int id) {
+    server_slot* get_slot(int id) {
 
         int64_t t_last = ggml_time_us();
         server_slot *last_used = nullptr;
@@ -1168,7 +1168,7 @@ struct llama_server_context
         queue_results.send(res);
     }
 
-    json get_formatted_generation(llama_client_slot &slot)
+    json get_formatted_generation(server_slot &slot)
     {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
@@ -1270,7 +1270,7 @@ struct llama_server_context
             {"model", params.model_alias},
            {"tokens_predicted", slot.n_decoded},
            {"tokens_evaluated", slot.n_prompt_tokens},
-            {"generation_settings", get_formated_generation(slot)},
+            {"generation_settings", get_formatted_generation(slot)},
             {"prompt", slot.prompt},
             {"truncated", slot.truncated},
             {"stopped_eos", slot.stopped_eos},
@@ -1559,7 +1559,7 @@ struct llama_server_context
         int n_idle_slots = 0;
         int n_processing_slots = 0;
 
-        for (llama_client_slot &slot: slots) {
+        for (server_slot &slot: slots) {
             json slot_data = get_formatted_generation(slot);
             slot_data["id"] = slot.id;
             slot_data["task_id"] = slot.task_id;
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index ad6e4ca4d..4c7ade096 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -137,48 +137,6 @@ struct task_multi {
     std::vector<task_result> results{};
 };
 
-// TODO: can become bool if we can't find use of more states; MAYBE there is a case for RESERVED to keep slots dedicated to chats?
-enum slot_state
-{
-    IDLE,
-    PROCESSING,
-};
-
-enum slot_command
-{
-    NONE,
-    LOAD_PROMPT,
-    RELEASE,
-};
-
-struct slot_params
-{
-    bool stream = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-
-    uint32_t seed = -1; // RNG seed
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_predict = -1; // new tokens to predict
-
-    std::vector<std::string> antiprompt;
-
-    json input_prefix;
-    json input_suffix;
-};
-
-struct slot_image
-{
-    int32_t id;
-
-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-
-    clip_image_u8 * img_data;
-
-    std::string prefix_prompt; // before of this image
-};
-
 // completion token output with probabilities
 struct completion_token_output {
     struct token_prob
@@ -198,7 +156,6 @@ struct token_translator {
     std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
 
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
 static inline void server_log(
     const char *level,
     const char *function,
     int line,
     const char *message,
@@ -211,8 +168,8 @@ static inline void server_log(
     std::string stdout_target,
     std::string stderr_target,
     std::string stdout_reset,
     std::string stderr_reset
-)
-{
+    )
+    {
     std::stringstream ss_tid;
     ss_tid << std::this_thread::get_id();
     json log = nlohmann::ordered_json{