diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 4a2b56598..d6bab439a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -143,19 +143,19 @@ struct server_slot {
     double t_token_generation; // ms

     void reset() {
-        n_prompt_tokens = 0;
-        generated_text = "";
-        truncated = false;
-        stopped_eos = false;
-        stopped_word = false;
-        stopped_limit = false;
-        stopping_word = "";
-        n_past = 0;
-        n_sent_text = 0;
-        n_sent_token_probs = 0;
-        infill = false;
-        ga_i = 0;
-        n_past_se = 0;
+        n_prompt_tokens    = 0;
+        generated_text     = "";
+        truncated          = false;
+        stopped_eos        = false;
+        stopped_word       = false;
+        stopped_limit      = false;
+        stopping_word      = "";
+        n_past             = 0;
+        n_sent_text        = 0;
+        n_sent_token_probs = 0;
+        infill             = false;
+        ga_i               = 0;
+        n_past_se          = 0;

         generated_token_probs.clear();
     }
@@ -433,8 +433,7 @@ struct llama_server_context {
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
     }

-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
-    {
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
         // TODO: currently, we tokenize using special tokens by default
         // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
         // but it's better compared to completely ignoring ChatML and other chat templates
@@ -444,32 +443,26 @@
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;

-        if (json_prompt.is_array())
-        {
+        if (json_prompt.is_array()) {
             bool first = true;
-            for (const auto& p : json_prompt)
-            {
-                if (p.is_string())
-                {
+            for (const auto & p : json_prompt) {
+                if (p.is_string()) {
                     auto s = p.template get<std::string>();
+
                     std::vector<llama_token> p;
-                    if (first)
-                    {
+                    if (first) {
                         p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
-                    }
-                    else
-                    {
+                    } else {
                         p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
+
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                }
-                else
-                {
-                    if (first)
-                    {
+                } else {
+                    if (first) {
                         first = false;
                     }
+
                     prompt_tokens.push_back(p.template get<llama_token>());
                 }
             }
@@ -499,7 +492,7 @@ struct llama_server_context {
         return last_used;
     }

-    bool launch_slot_with_data(server_slot & slot, json data) {
+    bool launch_slot_with_data(server_slot & slot, json data) const {
         slot_params default_params;
         llama_sampling_params default_sparams;

@@ -862,19 +855,6 @@ struct llama_server_context {
         return slot.has_next_token; // continue
     }

-    void send_error(const task_server & task, const std::string & error) {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
-
-        task_result res;
-        res.id = task.id;
-        res.id_multi = task.id_multi;
-        res.stop = false;
-        res.error = true;
-        res.result_json = { { "content", error } };
-
-        queue_results.send(res);
-    }
-
     json get_formated_generation(const server_slot & slot) const {
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@@ -921,6 +901,19 @@ struct llama_server_context {
         };
     }

+    void send_error(const task_server & task, const std::string & error) {
+        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+
+        task_result res;
+        res.id = task.id;
+        res.id_multi = task.id_multi;
+        res.stop = false;
+        res.error = true;
+        res.result_json = { { "content", error } };
+
+        queue_results.send(res);
+    }
+
     void send_partial_response(server_slot & slot, completion_token_output tkn) {
         task_result res;
         res.id = slot.id_task;
@@ -1057,13 +1050,13 @@ struct llama_server_context {

     void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding) {
         task_server task;
-        task.id = id_task;
-        task.id_multi = id_multi;
-        task.id_target = 0;
-        task.data = std::move(data);
+        task.id        = id_task;
+        task.id_multi  = id_multi;
+        task.id_target = 0;
+        task.data      = std::move(data);
         task.infill = infill;
         task.embedding = embedding;
-        task.type = TASK_TYPE_COMPLETION;
+        task.type      = TASK_TYPE_COMPLETION;

         // when a completion task's prompt array is not a singleton, we split it into multiple requests
         // otherwise, it's a single-prompt task, we actually queue it
@@ -1092,17 +1085,16 @@
         }
     }

-    void request_cancel(int id_task)
-    {
+    void request_cancel(int id_task) {
         task_server task;
-        task.type = TASK_TYPE_CANCEL;
+        task.type      = TASK_TYPE_CANCEL;
         task.id_target = id_task;
+
         queue_tasks.post(task);
     }

-    void split_multiprompt_task(int id_multi, task_server& multiprompt_task)
-    {
-        int prompt_count = multiprompt_task.data.at("prompt").size();
+    void split_multiprompt_task(int id_multi, const task_server & multiprompt_task) {
+        const int prompt_count = multiprompt_task.data.at("prompt").size();
         if (prompt_count <= 1) {
             send_error(multiprompt_task, "error while handling multiple prompts");
             return;
@@ -1110,8 +1102,7 @@

         // generate all the ID for subtask
         std::vector<int> subtask_ids(prompt_count);
-        for (int i = 0; i < prompt_count; i++)
-        {
+        for (int i = 0; i < prompt_count; i++) {
             subtask_ids[i] = queue_tasks.get_new_id();
         }

@@ -1119,8 +1110,7 @@
         queue_tasks.add_multitask(id_multi, subtask_ids);

         // add subtasks
-        for (int i = 0; i < prompt_count; i++)
-        {
+        for (int i = 0; i < prompt_count; i++) {
             json subtask_data = multiprompt_task.data;
             subtask_data["prompt"] = subtask_data["prompt"][i];
@@ -1129,7 +1119,7 @@
         }
     }

-    void process_single_task(task_server & task) {
+    void process_single_task(const task_server & task) {
         switch (task.type) {
             case TASK_TYPE_COMPLETION: {
@@ -1145,8 +1135,7 @@
                     system_prompt_process(task.data["system_prompt"]);

                     // reset cache_tokens for all slots
-                    for (server_slot & slot : slots)
-                    {
+                    for (server_slot & slot : slots) {
                         slot.cache_tokens.clear();
                         slot.n_past = 0;
                         slot.n_past_se = 0;
@@ -1182,11 +1171,12 @@
                 } break;
             case TASK_TYPE_METRICS:
                 {
-                    json slots_data = json::array();
+                    json slots_data = json::array();
+
                     int n_idle_slots = 0;
                     int n_processing_slots = 0;

-                    for (server_slot & slot: slots) {
+                    for (server_slot & slot : slots) {
                         json slot_data = get_formated_generation(slot);
                         slot_data["id"] = slot.id;
                         slot_data["id_task"] = slot.id_task;
@@ -1201,24 +1191,26 @@
                             {"stopped_limit", slot.stopped_limit},
                             {"stopping_word", slot.stopping_word},
                         };
+
                         if (slot_data["state"] == IDLE) {
                             n_idle_slots++;
                         } else {
                             n_processing_slots++;
                         }
+
                         slots_data.push_back(slot_data);
                     }

                     LOG_INFO("slot data", {
                         {"id_task", task.id},
-                        {"n_idle_slots", n_idle_slots},
-                        {"n_processing_slots", n_processing_slots}
+                        {"n_idle_slots", n_idle_slots},
+                        {"n_processing_slots", n_processing_slots}
                     });
                     LOG_VERBOSE("slot data", {
                         {"id_task", task.id},
-                        {"n_idle_slots", n_idle_slots},
-                        {"n_processing_slots", n_processing_slots},
-                        {"slots", slots_data}
+                        {"n_idle_slots", n_idle_slots},
+                        {"n_processing_slots", n_processing_slots},
+                        {"slots", slots_data}
                     });

                     task_result res;
@@ -1264,7 +1256,7 @@ struct llama_server_context {
             result_jsons.push_back(subres.result_json);
             result.error = result.error && subres.error;
         }
-        result.result_json = json{ { "results", result_jsons } };
+        result.result_json = json { { "results", result_jsons } };

         queue_results.send(result);
     }
@@ -1316,9 +1308,9 @@
             }
         }

-        LOG_VERBOSE("posting NEXT_RESPONSE", {});
-        {
+        {
+            LOG_VERBOSE("posting NEXT_RESPONSE", {});
+
             task_server task;
             task.type = TASK_TYPE_NEXT_RESPONSE;
             task.id_target = -1;
@@ -1410,22 +1402,21 @@
             }

             // need process the prompt
-            if (slot.state == IDLE && slot.command == LOAD_PROMPT)
-            {
+            if (slot.state == IDLE && slot.command == LOAD_PROMPT) {
                 slot.state = PROCESSING;
                 slot.command = NONE;
+
                 std::vector<llama_token> prompt_tokens;
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_genereration = 0;

-                if (slot.infill)
-                {
+                if (slot.infill) {
                     bool suff_rm_leading_spc = true;
-                    if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
-                    {
+                    if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                         params.input_suffix.erase(0, 1);
                         suff_rm_leading_spc = false;
                     }
+
                     auto prefix_tokens = tokenize(slot.params.input_prefix, false);
                     auto suffix_tokens = tokenize(slot.params.input_suffix, false);
@@ -1440,23 +1431,19 @@
                     prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
                     prefix_tokens.push_back(llama_token_middle(model));
                     prompt_tokens = prefix_tokens;
-                }
-                else
-                {
+                } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
                 }

                 slot.n_prompt_tokens = prompt_tokens.size();

-                if (slot.params.n_keep < 0)
-                {
+                if (slot.params.n_keep < 0) {
                     slot.params.n_keep = slot.n_prompt_tokens;
                 }
                 slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

                 // if input prompt is too big, truncate it, if group attention self-extend is disabled
-                if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
-                {
+                if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
                     const int n_left = slot.n_ctx - slot.params.n_keep;
                     const int n_block_size = n_left / 2;
                     const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
@@ -1476,6 +1463,7 @@
                         {"n_left", n_left},
                         {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                     });
+
                     slot.truncated = true;
                     prompt_tokens = new_tokens;
@@ -1483,8 +1471,7 @@
                     GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                 }

-                if (!slot.params.cache_prompt)
-                {
+                if (!slot.params.cache_prompt) {
                     llama_sampling_reset(slot.ctx_sampling);

                     slot.n_past = 0;
@@ -1492,12 +1479,9 @@
                     slot.ga_i = 0;

                     slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
-                }
-                else
-                {
+                } else {
                     // push the prompt into the sampling context (do not apply grammar)
-                    for (auto & token : prompt_tokens)
-                    {
+                    for (auto & token : prompt_tokens) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
                     }
@@ -1505,21 +1489,18 @@

                     // the last token of the cache is not in the KV cache until the next call to llama_decode
                     // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
-                    if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
-                    {
+                    if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size()) {
                         slot.n_past -= 1;
                     }

                     slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;

-                    if (slot.ga_n != 1)
-                    {
+                    if (slot.ga_n != 1) {
                         int ga_i = 0;
                         int32_t ga_n = slot.ga_n;
                         int32_t ga_w = slot.ga_w;
                         int32_t slot_npast = 0;
-                        for (int k = 0; k < slot.n_past; ++k)
-                        {
+                        for (int k = 0; k < slot.n_past; ++k) {
                             while (slot_npast >= ga_i + ga_w) {
                                 const int bd = (ga_w/ga_n)*(ga_n - 1);
                                 slot_npast -= bd;
@@ -1543,8 +1524,7 @@

                 slot.cache_tokens = prompt_tokens;

-                if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
-                {
+                if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
                     // we have to evaluate at least 1 token to generate logits.
                     LOG_INFO("we have to evaluate at least 1 token to generate logits", {
                         { "id_slot", slot.id },
@@ -1580,23 +1560,22 @@
                     int32_t ga_n = slot.ga_n;
                     int32_t ga_w = slot.ga_w;

-                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
-                    {
-                        if (slot.ga_n != 1)
-                        {
+                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) {
+                        if (slot.ga_n != 1) {
                             while (slot_npast >= ga_i + ga_w) {
                                 const int bd = (ga_w/ga_n)*(ga_n - 1);
                                 slot_npast -= bd;
                                 ga_i += ga_w/ga_n;
                             }
                         }
+
                         llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
+
                         slot_npast++;
                     }

                     // extract the logits only for the last token
-                    if (batch.n_tokens > 0)
-                    {
+                    if (batch.n_tokens > 0) {
                         batch.logits[batch.n_tokens - 1] = true;
                     }
@@ -1728,12 +1707,12 @@ struct llama_server_context {

     json model_meta() const {
         return json {
-            {"vocab_type", llama_vocab_type(model)},
-            {"n_vocab", llama_n_vocab(model)},
-            {"n_ctx_train", llama_n_ctx_train(model)},
-            {"n_embd", llama_n_embd(model)},
+            {"vocab_type", llama_vocab_type (model)},
+            {"n_vocab", llama_n_vocab (model)},
+            {"n_ctx_train", llama_n_ctx_train (model)},
+            {"n_embd", llama_n_embd (model)},
             {"n_params", llama_model_n_params(model)},
-            {"size", llama_model_size(model)},
+            {"size", llama_model_size (model)},
         };
     }
 };
@@ -1756,17 +1735,14 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
     printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
     printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf(" --pooling {none,mean,cls}\n");
-    printf(" pooling type for embeddings, use model default if unspecified\n");
+    printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
-    if (llama_supports_mlock())
-    {
+    if (llama_supports_mlock()) {
         printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
-    if (llama_supports_mmap())
-    {
+    if (llama_supports_mmap()) {
         printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
     printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
@@ -2302,8 +2278,7 @@ int main(int argc, char ** argv) {

     server_params_parse(argc, argv, sparams, params, llama);

-    if (params.model_alias == "unknown")
-    {
+    if (params.model_alias == "unknown") {
         params.model_alias = params.model;
     }

@@ -2643,6 +2618,7 @@ int main(int argc, char ** argv) {
             json data = json::parse(req.body);

             const int id_task = llama.queue_tasks.get_new_id();
+            llama.queue_results.add_waiting_task_id(id_task);
             llama.request_completion(id_task, -1, data, false, false);
@@ -2741,6 +2717,7 @@ int main(int argc, char ** argv) {
             json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);

             const int id_task = llama.queue_tasks.get_new_id();
+            llama.queue_results.add_waiting_task_id(id_task);
             llama.request_completion(id_task, -1, data, false, false);
@@ -2821,8 +2798,10 @@ int main(int argc, char ** argv) {
             json data = json::parse(req.body);

             const int id_task = llama.queue_tasks.get_new_id();
+            llama.queue_results.add_waiting_task_id(id_task);
             llama.request_completion(id_task, -1, data, true, false);
+
             if (!json_value(data, "stream", false)) {
                 task_result result = llama.queue_results.recv(id_task);
                 if (!result.error && result.stop) {
@@ -2915,6 +2894,7 @@ int main(int argc, char ** argv) {
             // create and queue the task
             const int id_task = llama.queue_tasks.get_new_id();
+            llama.queue_results.add_waiting_task_id(id_task);
             llama.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0} }, false, true);
@@ -2939,6 +2919,7 @@ int main(int argc, char ** argv) {
             int i = 0;
             for (const json & elem : prompt) {
                 const int id_task = llama.queue_tasks.get_new_id();
+                llama.queue_results.add_waiting_task_id(id_task);
                 llama.request_completion(id_task, -1, { {"prompt", elem}, { "n_predict", 0} }, false, true);
@@ -2965,6 +2946,7 @@ int main(int argc, char ** argv) {
             // create and queue the task
             const int id_task = llama.queue_tasks.get_new_id();
+            llama.queue_results.add_waiting_task_id(id_task);
             llama.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0}}, false, true);
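Every HTTP handler hunk above adds a `llama.queue_results.add_waiting_task_id(id_task)` call immediately before `llama.request_completion(...)`. The snippet below is not the server's implementation; it is a minimal, self-contained sketch (the `result_queue_sketch` and `task_result_sketch` names are hypothetical) of the pattern this ordering suggests: the reader registers interest in a task id before the task is posted, so a result that is published before `recv()` is reached can be retained for that id rather than discarded.

```cpp
// Minimal sketch, not the server's actual queue: a result queue where a reader
// registers a task id before the producer may publish a result for it.
#include <condition_variable>
#include <mutex>
#include <string>
#include <unordered_set>
#include <vector>

struct task_result_sketch {
    int id;
    std::string result_json;
};

struct result_queue_sketch {
    // called by the HTTP thread before the task is posted to the worker
    void add_waiting_task_id(int id_task) {
        std::lock_guard<std::mutex> lock(mtx);
        waiting.insert(id_task);
    }

    // called by the worker when a task finishes
    void send(task_result_sketch res) {
        std::lock_guard<std::mutex> lock(mtx);
        // keep the result only if some reader registered for this id
        if (waiting.count(res.id) > 0) {
            results.push_back(std::move(res));
            cv.notify_all();
        }
    }

    // called by the HTTP thread; blocks until a result for id_task is available
    task_result_sketch recv(int id_task) {
        std::unique_lock<std::mutex> lock(mtx);
        while (true) {
            for (size_t i = 0; i < results.size(); i++) {
                if (results[i].id == id_task) {
                    task_result_sketch res = results[i];
                    results.erase(results.begin() + i);
                    waiting.erase(id_task);
                    return res;
                }
            }
            cv.wait(lock);
        }
    }

    std::mutex mtx;
    std::condition_variable cv;
    std::unordered_set<int> waiting;
    std::vector<task_result_sketch> results;
};
```

Registering after posting would leave a window in which the worker could finish and publish the result while no reader is marked as waiting; assuming the real queue, like this sketch, only keeps results for registered ids, registering first closes that window.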
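The truncation hunk around old line 1440 drops the middle of an oversized prompt in blocks of `n_left / 2` tokens when group-attention self-extend is disabled (`slot.ga_n == 1`). A small worked example with assumed numbers (the values below are illustrative, not taken from the patch) shows the arithmetic:

```cpp
#include <cstdio>

int main() {
    // assumed values for illustration only
    const int n_ctx           = 2048;  // slot context size
    const int n_keep          = 256;   // tokens always kept from the start of the prompt
    const int n_prompt_tokens = 3000;  // incoming prompt length (too big for n_ctx)

    // same arithmetic as the hunk around old line 1440
    const int n_left        = n_ctx - n_keep;                                            // 1792
    const int n_block_size  = n_left / 2;                                                // 896
    const int erased_blocks = (n_prompt_tokens - n_keep - n_block_size) / n_block_size;  // 2

    // kept tokens = n_keep from the head + everything after the erased blocks
    const int n_erased = erased_blocks * n_block_size;                   // 1792
    const int n_kept   = n_keep + (n_prompt_tokens - n_keep - n_erased); // 1208

    printf("erase %d tokens, keep %d of %d (n_ctx = %d)\n", n_erased, n_kept, n_prompt_tokens, n_ctx);
    return 0;
}
```

The resulting prompt (1208 tokens in this example) ends up strictly below `slot.n_ctx`, which is what the `GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx)` in the same region checks.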