fix spacing

ngxson 2024-02-28 22:06:21 +01:00
parent e2992ea332
commit 51381f8f5d

@@ -33,8 +33,7 @@
 using json = nlohmann::json;
-struct server_params
-{
+struct server_params {
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
@@ -161,7 +160,7 @@ struct server_slot {
     int multitask_id = -1;
     void reset() {
         n_prompt_tokens = 0;
         generated_text = "";
         truncated = false;
         stopped_eos = false;
@@ -177,8 +176,7 @@ struct server_slot {
         generated_token_probs.clear();
-        for (slot_image & img : images)
-        {
+        for (slot_image & img : images) {
             free(img.image_embedding);
             if (img.img_data) {
                 clip_image_u8_free(img.img_data);
@@ -190,19 +188,15 @@ struct server_slot {
     }
     bool has_budget(gpt_params &global_params) {
-        if (params.n_predict == -1 && global_params.n_predict == -1)
-        {
+        if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
         n_remaining = -1;
-        if (params.n_predict != -1)
-        {
+        if (params.n_predict != -1) {
             n_remaining = params.n_predict - n_decoded;
-        }
-        else if (global_params.n_predict != -1)
-        {
+        } else if (global_params.n_predict != -1) {
             n_remaining = global_params.n_predict - n_decoded;
         }
@@ -218,8 +212,7 @@ struct server_slot {
     }
     void add_token_string(const completion_token_output &token) {
-        if (command == RELEASE)
-        {
+        if (command == RELEASE) {
             return;
         }
         cache_tokens.push_back(token.tok);
@@ -257,12 +250,12 @@ struct server_slot {
                 t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
         LOG_INFO(buffer, {
             {"slot_id", id},
             {"task_id", task_id},
             {"t_prompt_processing", t_prompt_processing},
             {"n_prompt_tokens_processed", n_prompt_tokens_processed},
             {"t_token", t_token},
             {"n_tokens_second", n_tokens_second},
         });
         t_token = t_token_generation / n_decoded;
@@ -352,7 +345,7 @@ struct llama_server_context
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
     llama_server_queue queue_tasks;
     llama_server_response queue_results;
     server_metrics metrics;
@@ -920,8 +913,8 @@ struct llama_server_context
             {
                 if (type == STOP_FULL)
                 {
                     slot.stopped_word = true;
                     slot.stopping_word = word;
                     slot.has_next_token = false;
                 }
                 stop_pos = pos;
@@ -1227,9 +1220,7 @@ struct llama_server_context
         const int n_embd = llama_n_embd(model);
         if (!params.embedding)
         {
-            LOG_WARNING("embedding disabled", {
-                {"params.embedding", params.embedding},
-            });
+            LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
             res.result_json = json
             {
                 {"embedding", std::vector<float>(n_embd, 0.0f)},
@@ -1241,7 +1232,7 @@ struct llama_server_context
             std::vector<float> embedding(data, data + n_embd);
             res.result_json = json
             {
-                {"embedding", embedding },
+                {"embedding", embedding},
             };
         }
         queue_results.send(res);
@@ -1329,7 +1320,17 @@ struct llama_server_context
                 }
                 const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
+                llama_batch batch_img = {
+                    n_eval,
+                    nullptr,
+                    (img.image_embedding + i * n_embd),
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    slot.n_past,
+                    1, 0
+                };
                 if (llama_decode(ctx, batch_img))
                 {
                     LOG_TEE("%s : failed to eval image\n", __func__);
@@ -1464,13 +1465,13 @@ struct llama_server_context
                 slot_data["state"] = slot.state;
                 slot_data["prompt"] = slot.prompt;
                 slot_data["next_token"] = {
                     {"has_next_token", slot.has_next_token},
                     {"n_remain", slot.n_remaining},
                     {"num_tokens_predicted", slot.n_decoded},
                     {"stopped_eos", slot.stopped_eos},
                     {"stopped_word", slot.stopped_word},
                     {"stopped_limit", slot.stopped_limit},
                     {"stopping_word", slot.stopping_word},
                 };
                 if (slot_data["state"] == IDLE) {
                     n_idle_slots++;
@@ -1508,10 +1509,10 @@ struct llama_server_context
                     { "n_tokens_predicted", metrics.n_tokens_predicted},
                     { "t_tokens_generation", metrics.t_tokens_generation},
                     { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
                     { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
                     { "slots", slots_data },
                 };
                 metrics.reset_bucket();
                 queue_results.send(res);
@@ -1714,13 +1715,18 @@ struct llama_server_context
                 const int n_block_size = n_left / 2;
                 const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
-                std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
-                new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
+                std::vector<llama_token> new_tokens(
+                    prompt_tokens.begin(),
+                    prompt_tokens.begin() + slot.params.n_keep);
+                new_tokens.insert(
+                    new_tokens.end(),
+                    prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                    prompt_tokens.end());
                 LOG_VERBOSE("input truncated", {
                     {"n_ctx", slot.n_ctx},
                     {"n_keep", slot.params.n_keep},
                     {"n_left", n_left},
                     {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                 });
                 slot.truncated = true;
@@ -1734,9 +1740,9 @@ struct llama_server_context
                 {
                     llama_sampling_reset(slot.ctx_sampling);
                     slot.n_past = 0;
                     slot.n_past_se = 0;
                     slot.ga_i = 0;
                     slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
                 }
                 else
@@ -1843,8 +1849,8 @@ struct llama_server_context
                 if (has_images && !ingest_images(slot, n_batch))
                 {
                     LOG_ERROR("failed processing images", {
-                        "slot_id", slot.id,
-                        "task_id", slot.task_id,
+                        {"slot_id", slot.id},
+                        {"task_id", slot.task_id},
                     });
                     // FIXME @phymbert: to be properly tested
                     // early returning without changing the slot state will block the slot for ever