Renaming some vars

pudepiedj 2024-03-02 10:24:07 +00:00
parent f51554180a
commit 5d61ae8d2a
2 changed files with 9 additions and 52 deletions

File 1 of 2

@@ -253,8 +253,8 @@ struct server_slot {
std::string stdout_reset;
std::string stderr_target;
std::string stderr_reset;
- double t_token = t_prompt_processing / num_prompt_tokens_processed;
- double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+ double t_token = t_prompt_processing / n_prompt_tokens_processed;
+ double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
//printf("\033[72;0H]");
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
t_prompt_processing, n_prompt_tokens_processed,
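
The renamed lines above only change the token-count variable; the arithmetic itself converts a millisecond total into a per-token cost and a tokens-per-second rate. A minimal standalone sketch of that arithmetic, with made-up numbers, prints the same message the buffer is built from:

    // Standalone check of the prompt-timing arithmetic above; the values are illustrative only.
    #include <cstdio>

    int main() {
        double t_prompt_processing       = 250.0; // total prompt eval time in ms (example value)
        int    n_prompt_tokens_processed = 50;    // prompt tokens evaluated (example value)

        double t_token         = t_prompt_processing / n_prompt_tokens_processed;       // 5.00 ms per token
        double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; // 200.00 tokens per second

        printf("prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
               t_prompt_processing, n_prompt_tokens_processed, t_token, n_tokens_second);
        return 0;
    }
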
@@ -328,7 +328,7 @@ struct server_metrics {
// experimental/diagnostic graphic to show kvcache status
// requires just `slots` and `params.n_ctx` as parameters
- static void kvgraphics(std::vector<llama_client_slot>& slots) {
+ static void kvgraphics(std::vector<server_slot>& slots) {
int max_length = 144;
int num_blocks = slots.size();
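
The body of kvgraphics is not part of this hunk; as a rough idea of the per-slot bar the comment describes, a sketch might look like the following. The cache_tokens member, the clamping, and the character choices are assumptions, not the committed implementation:

    // Hypothetical sketch only: one text bar per slot, scaled to max_length columns,
    // with '#' for occupied kv-cache cells and '.' for free ones.
    static void kvgraphics_sketch(std::vector<server_slot> & slots, int n_ctx) {
        const int max_length = 144;
        for (size_t i = 0; i < slots.size(); i++) {
            const int used = (int) slots[i].cache_tokens.size();   // assumed field: tokens currently cached
            int fill = n_ctx > 0 ? used * max_length / n_ctx : 0;  // scale usage to the bar width
            if (fill > max_length) { fill = max_length; }          // clamp in case used exceeds n_ctx
            printf("slot %2zu [%s%s] %d/%d\n", i,
                   std::string(fill, '#').c_str(),
                   std::string(max_length - fill, '.').c_str(),
                   used, n_ctx);
        }
    }
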
@@ -604,7 +604,7 @@ struct llama_server_context
// the logic seems wrong in this function
// why should there be an id in a task matching a slot.id before a slot has been assigned?
// most commonly id = -1 so we deal with that first rather than the specified id > 0
- llama_client_slot* get_slot(int id) {
+ server_slot* get_slot(int id) {
int64_t t_last = ggml_time_us();
server_slot *last_used = nullptr;
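
For readers following the comment above get_slot: a minimal sketch of the policy it describes, where an explicit id wins and id == -1 falls back to the least-recently-used idle slot. The state and t_last_used members are assumptions drawn from the surrounding code, not the committed body:

    // Hypothetical sketch of the selection policy, not the actual get_slot() body.
    static server_slot * pick_slot_sketch(std::vector<server_slot> & slots, int id) {
        if (id != -1 && id < (int) slots.size()) {
            return &slots[id];                  // a specific slot was requested
        }
        int64_t t_last = ggml_time_us();        // newest possible timestamp
        server_slot * last_used = nullptr;
        for (server_slot & slot : slots) {
            // assumed members: state (IDLE/PROCESSING) and t_last_used (when the slot last finished work)
            if (slot.state == IDLE && slot.t_last_used < t_last) {
                t_last    = slot.t_last_used;   // remember the oldest idle slot seen so far
                last_used = &slot;
            }
        }
        return last_used;                       // nullptr if every slot is busy
    }
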
@@ -1168,7 +1168,7 @@ struct llama_server_context
queue_results.send(res);
}
- json get_formatted_generation(llama_client_slot &slot)
+ json get_formatted_generation(server_slot &slot)
{
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
@@ -1270,7 +1270,7 @@ struct llama_server_context
{"model", params.model_alias},
{"tokens_predicted", slot.n_decoded},
{"tokens_evaluated", slot.n_prompt_tokens},
- {"generation_settings", get_formated_generation(slot)},
+ {"generation_settings", get_formatted_generation(slot)},
{"prompt", slot.prompt},
{"truncated", slot.truncated},
{"stopped_eos", slot.stopped_eos},
@@ -1559,7 +1559,7 @@ struct llama_server_context
int n_idle_slots = 0;
int n_processing_slots = 0;
- for (llama_client_slot &slot: slots) {
+ for (server_slot &slot: slots) {
json slot_data = get_formatted_generation(slot);
slot_data["id"] = slot.id;
slot_data["task_id"] = slot.task_id;
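
The hunk is cut off here; a hypothetical continuation, showing how the per-slot entries might be gathered into the response and the idle/processing counters updated, could look roughly like this (the slots_data array and the state test are assumptions):

    // Hypothetical continuation sketch: collect one JSON object per slot and tally idle vs processing.
    json slots_data = json::array();
    for (server_slot & slot : slots) {
        json slot_data = get_formatted_generation(slot);
        slot_data["id"]      = slot.id;
        slot_data["task_id"] = slot.task_id;
        if (slot.state == IDLE) {
            n_idle_slots++;
        } else {
            n_processing_slots++;
        }
        slots_data.push_back(slot_data);
    }
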

File 2 of 2

@@ -137,48 +137,6 @@ struct task_multi {
std::vector<task_result> results{};
};
- // TODO: can become bool if we can't find use of more states; MAYBE there is a case for RESERVED to keep slots dedicated to chats?
- enum slot_state
- {
-     IDLE,
-     PROCESSING,
- };
- enum slot_command
- {
-     NONE,
-     LOAD_PROMPT,
-     RELEASE,
- };
- struct slot_params
- {
-     bool stream = true;
-     bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
-     uint32_t seed = -1; // RNG seed
-     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-     int32_t n_predict = -1; // new tokens to predict
-     std::vector<std::string> antiprompt;
-     json input_prefix;
-     json input_suffix;
- };
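
slot_params, removed here, carries the per-request completion options. A hypothetical helper showing how a request body might populate it; the JSON field names ("stream", "cache_prompt", "seed", "n_keep", "n_predict", "stop") are assumptions for illustration, not part of this commit:

    // Hypothetical sketch: filling slot_params from a completion request body.
    static void parse_slot_params_sketch(const json & data, slot_params & sp) {
        sp.stream       = data.value("stream",       true);
        sp.cache_prompt = data.value("cache_prompt", false);
        sp.seed         = data.value("seed",         (uint32_t) -1);
        sp.n_keep       = data.value("n_keep",       0);
        sp.n_predict    = data.value("n_predict",    -1);
        if (data.contains("stop") && data.at("stop").is_array()) {
            for (const auto & s : data.at("stop")) {
                sp.antiprompt.push_back(s.get<std::string>()); // stop strings map onto antiprompt
            }
        }
    }
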
- struct slot_image
- {
-     int32_t id;
-     bool request_encode_image = false;
-     float * image_embedding = nullptr;
-     int32_t image_tokens = 0;
-     clip_image_u8 * img_data;
-     std::string prefix_prompt; // prompt text that comes before this image
- };
// completion token output with probabilities
struct completion_token_output {
struct token_prob
@@ -198,7 +156,6 @@ struct token_translator {
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
};
- static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
static inline void server_log(
const char *level,
const char *function,
@@ -211,8 +168,8 @@ static inline void server_log(
std::string stderr_target,
std::string stdout_reset,
std::string stderr_reset
- )
- {
+ )
+ {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{