fix spacing
This commit is contained in:
parent
e2992ea332
commit
51381f8f5d
1 changed files with 51 additions and 45 deletions
|
@ -33,8 +33,7 @@
|
||||||
|
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
struct server_params
|
struct server_params {
|
||||||
{
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
std::string public_path = "examples/server/public";
|
std::string public_path = "examples/server/public";
|
||||||
|
@ -161,7 +160,7 @@ struct server_slot {
|
||||||
int multitask_id = -1;
|
int multitask_id = -1;
|
||||||
|
|
||||||
void reset() {
|
void reset() {
|
||||||
n_prompt_tokens = 0;
|
n_prompt_tokens = 0;
|
||||||
generated_text = "";
|
generated_text = "";
|
||||||
truncated = false;
|
truncated = false;
|
||||||
stopped_eos = false;
|
stopped_eos = false;
|
||||||
|
@ -177,8 +176,7 @@ struct server_slot {
|
||||||
|
|
||||||
generated_token_probs.clear();
|
generated_token_probs.clear();
|
||||||
|
|
||||||
for (slot_image & img : images)
|
for (slot_image & img : images) {
|
||||||
{
|
|
||||||
free(img.image_embedding);
|
free(img.image_embedding);
|
||||||
if (img.img_data) {
|
if (img.img_data) {
|
||||||
clip_image_u8_free(img.img_data);
|
clip_image_u8_free(img.img_data);
|
||||||
|
@ -190,19 +188,15 @@ struct server_slot {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool has_budget(gpt_params &global_params) {
|
bool has_budget(gpt_params &global_params) {
|
||||||
if (params.n_predict == -1 && global_params.n_predict == -1)
|
if (params.n_predict == -1 && global_params.n_predict == -1) {
|
||||||
{
|
|
||||||
return true; // limitless
|
return true; // limitless
|
||||||
}
|
}
|
||||||
|
|
||||||
n_remaining = -1;
|
n_remaining = -1;
|
||||||
|
|
||||||
if (params.n_predict != -1)
|
if (params.n_predict != -1) {
|
||||||
{
|
|
||||||
n_remaining = params.n_predict - n_decoded;
|
n_remaining = params.n_predict - n_decoded;
|
||||||
}
|
} else if (global_params.n_predict != -1) {
|
||||||
else if (global_params.n_predict != -1)
|
|
||||||
{
|
|
||||||
n_remaining = global_params.n_predict - n_decoded;
|
n_remaining = global_params.n_predict - n_decoded;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -218,8 +212,7 @@ struct server_slot {
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_token_string(const completion_token_output &token) {
|
void add_token_string(const completion_token_output &token) {
|
||||||
if (command == RELEASE)
|
if (command == RELEASE) {
|
||||||
{
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cache_tokens.push_back(token.tok);
|
cache_tokens.push_back(token.tok);
|
||||||
|
@ -257,12 +250,12 @@ struct server_slot {
|
||||||
t_prompt_processing, n_prompt_tokens_processed,
|
t_prompt_processing, n_prompt_tokens_processed,
|
||||||
t_token, n_tokens_second);
|
t_token, n_tokens_second);
|
||||||
LOG_INFO(buffer, {
|
LOG_INFO(buffer, {
|
||||||
{"slot_id", id},
|
{"slot_id", id},
|
||||||
{"task_id", task_id},
|
{"task_id", task_id},
|
||||||
{"t_prompt_processing", t_prompt_processing},
|
{"t_prompt_processing", t_prompt_processing},
|
||||||
{"n_prompt_tokens_processed", n_prompt_tokens_processed},
|
{"n_prompt_tokens_processed", n_prompt_tokens_processed},
|
||||||
{"t_token", t_token},
|
{"t_token", t_token},
|
||||||
{"n_tokens_second", n_tokens_second},
|
{"n_tokens_second", n_tokens_second},
|
||||||
});
|
});
|
||||||
|
|
||||||
t_token = t_token_generation / n_decoded;
|
t_token = t_token_generation / n_decoded;
|
||||||
|
@ -352,7 +345,7 @@ struct llama_server_context
|
||||||
std::vector<server_slot> slots;
|
std::vector<server_slot> slots;
|
||||||
json default_generation_settings_for_props;
|
json default_generation_settings_for_props;
|
||||||
|
|
||||||
llama_server_queue queue_tasks;
|
llama_server_queue queue_tasks;
|
||||||
llama_server_response queue_results;
|
llama_server_response queue_results;
|
||||||
|
|
||||||
server_metrics metrics;
|
server_metrics metrics;
|
||||||
|
@ -920,8 +913,8 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
if (type == STOP_FULL)
|
if (type == STOP_FULL)
|
||||||
{
|
{
|
||||||
slot.stopped_word = true;
|
slot.stopped_word = true;
|
||||||
slot.stopping_word = word;
|
slot.stopping_word = word;
|
||||||
slot.has_next_token = false;
|
slot.has_next_token = false;
|
||||||
}
|
}
|
||||||
stop_pos = pos;
|
stop_pos = pos;
|
||||||
|
@ -1227,9 +1220,7 @@ struct llama_server_context
|
||||||
const int n_embd = llama_n_embd(model);
|
const int n_embd = llama_n_embd(model);
|
||||||
if (!params.embedding)
|
if (!params.embedding)
|
||||||
{
|
{
|
||||||
LOG_WARNING("embedding disabled", {
|
LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
|
||||||
{"params.embedding", params.embedding},
|
|
||||||
});
|
|
||||||
res.result_json = json
|
res.result_json = json
|
||||||
{
|
{
|
||||||
{"embedding", std::vector<float>(n_embd, 0.0f)},
|
{"embedding", std::vector<float>(n_embd, 0.0f)},
|
||||||
|
@ -1241,7 +1232,7 @@ struct llama_server_context
|
||||||
std::vector<float> embedding(data, data + n_embd);
|
std::vector<float> embedding(data, data + n_embd);
|
||||||
res.result_json = json
|
res.result_json = json
|
||||||
{
|
{
|
||||||
{"embedding", embedding },
|
{"embedding", embedding},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
|
@ -1329,7 +1320,17 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_embd = llama_n_embd(model);
|
const int n_embd = llama_n_embd(model);
|
||||||
llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
|
llama_batch batch_img = {
|
||||||
|
n_eval,
|
||||||
|
nullptr,
|
||||||
|
(img.image_embedding + i * n_embd),
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
slot.n_past,
|
||||||
|
1, 0
|
||||||
|
};
|
||||||
if (llama_decode(ctx, batch_img))
|
if (llama_decode(ctx, batch_img))
|
||||||
{
|
{
|
||||||
LOG_TEE("%s : failed to eval image\n", __func__);
|
LOG_TEE("%s : failed to eval image\n", __func__);
|
||||||
|
@ -1464,13 +1465,13 @@ struct llama_server_context
|
||||||
slot_data["state"] = slot.state;
|
slot_data["state"] = slot.state;
|
||||||
slot_data["prompt"] = slot.prompt;
|
slot_data["prompt"] = slot.prompt;
|
||||||
slot_data["next_token"] = {
|
slot_data["next_token"] = {
|
||||||
{"has_next_token", slot.has_next_token},
|
{"has_next_token", slot.has_next_token},
|
||||||
{"n_remain", slot.n_remaining},
|
{"n_remain", slot.n_remaining},
|
||||||
{"num_tokens_predicted", slot.n_decoded},
|
{"num_tokens_predicted", slot.n_decoded},
|
||||||
{"stopped_eos", slot.stopped_eos},
|
{"stopped_eos", slot.stopped_eos},
|
||||||
{"stopped_word", slot.stopped_word},
|
{"stopped_word", slot.stopped_word},
|
||||||
{"stopped_limit", slot.stopped_limit},
|
{"stopped_limit", slot.stopped_limit},
|
||||||
{"stopping_word", slot.stopping_word},
|
{"stopping_word", slot.stopping_word},
|
||||||
};
|
};
|
||||||
if (slot_data["state"] == IDLE) {
|
if (slot_data["state"] == IDLE) {
|
||||||
n_idle_slots++;
|
n_idle_slots++;
|
||||||
|
@ -1508,10 +1509,10 @@ struct llama_server_context
|
||||||
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
||||||
{ "t_tokens_generation", metrics.t_tokens_generation},
|
{ "t_tokens_generation", metrics.t_tokens_generation},
|
||||||
|
|
||||||
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
||||||
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
||||||
|
|
||||||
{ "slots", slots_data },
|
{ "slots", slots_data },
|
||||||
};
|
};
|
||||||
metrics.reset_bucket();
|
metrics.reset_bucket();
|
||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
|
@ -1714,13 +1715,18 @@ struct llama_server_context
|
||||||
const int n_block_size = n_left / 2;
|
const int n_block_size = n_left / 2;
|
||||||
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
||||||
|
|
||||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
|
std::vector<llama_token> new_tokens(
|
||||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
|
prompt_tokens.begin(),
|
||||||
|
prompt_tokens.begin() + slot.params.n_keep);
|
||||||
|
new_tokens.insert(
|
||||||
|
new_tokens.end(),
|
||||||
|
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
|
||||||
|
prompt_tokens.end());
|
||||||
|
|
||||||
LOG_VERBOSE("input truncated", {
|
LOG_VERBOSE("input truncated", {
|
||||||
{"n_ctx", slot.n_ctx},
|
{"n_ctx", slot.n_ctx},
|
||||||
{"n_keep", slot.params.n_keep},
|
{"n_keep", slot.params.n_keep},
|
||||||
{"n_left", n_left},
|
{"n_left", n_left},
|
||||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||||
});
|
});
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
|
@ -1734,9 +1740,9 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
llama_sampling_reset(slot.ctx_sampling);
|
llama_sampling_reset(slot.ctx_sampling);
|
||||||
|
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
slot.n_past_se = 0;
|
slot.n_past_se = 0;
|
||||||
slot.ga_i = 0;
|
slot.ga_i = 0;
|
||||||
slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
|
slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -1843,8 +1849,8 @@ struct llama_server_context
|
||||||
if (has_images && !ingest_images(slot, n_batch))
|
if (has_images && !ingest_images(slot, n_batch))
|
||||||
{
|
{
|
||||||
LOG_ERROR("failed processing images", {
|
LOG_ERROR("failed processing images", {
|
||||||
"slot_id", slot.id,
|
{"slot_id", slot.id},
|
||||||
"task_id", slot.task_id,
|
{"task_id", slot.task_id},
|
||||||
});
|
});
|
||||||
// FIXME @phymbert: to be properly tested
|
// FIXME @phymbert: to be properly tested
|
||||||
// early returning without changing the slot state will block the slot for ever
|
// early returning without changing the slot state will block the slot for ever
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue