From 51e09944cef163c5b8bb88d91a25541e5fa606e3 Mon Sep 17 00:00:00 2001
From: Henri Vasserman
Date: Sun, 28 May 2023 02:42:18 +0300
Subject: [PATCH] server rewrite

Remove unnecessary things and radically rewrite server

---
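Reviewer note: the rewritten loadPrompt() below reuses the longest common
prefix between the freshly tokenized prompt and the tokens that were already
evaluated, so only the changed tail of a prompt is re-evaluated. A minimal
standalone sketch of that idea follows; the names and values are illustrative
only and are not part of the server's actual API.

// prefix_reuse_sketch.cpp -- illustrative only, mirrors the for-loop in loadPrompt()
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> processed = {1, 10, 20, 30, 40}; // tokens already evaluated
    std::vector<int> prompt    = {1, 10, 20, 99, 98}; // tokens of the new prompt

    // Walk the common prefix, but stop before the last prompt token so at
    // least one token is always left to evaluate (it yields the logits
    // needed to sample the next token).
    size_t n_past = 0;
    while (n_past < prompt.size() - 1 && n_past < processed.size() &&
           prompt[n_past] == processed[n_past]) {
        n_past++;
    }

    processed.resize(n_past); // drop the stale tail of the cache
    printf("reuse %zu cached tokens, evaluate %zu new ones\n",
           n_past, prompt.size() - n_past);
    return 0;
}
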
 examples/server/server.cpp | 202 +++++++------------------------------
 1 file changed, 35 insertions(+), 167 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 356ec1b7b..243da3564 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -15,19 +15,16 @@ struct llama_server_context
     bool has_next_token = false;
     std::string generated_text = "";
 
-    int32_t num_tokens_predicted = 0;
-    int32_t n_past = 0;
-    int32_t n_consumed = 0;
-    int32_t n_session_consumed = 0;
-    int32_t n_remain = 0;
+    size_t num_tokens_predicted = 0;
+    size_t n_past = 0;
+    size_t n_consumed = 0;
+    size_t n_session_consumed = 0;
+    size_t n_remain = 0;
 
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;
     std::vector<llama_token> processed_tokens;
-    std::vector<llama_token> llama_token_newline;
     std::vector<llama_token> embd_inp;
-    std::vector<std::vector<llama_token>> no_show_words;
-    std::vector<llama_token> tokens_predicted;
 
     std::vector<llama_token> last_prompt_tokens;
 
@@ -37,9 +34,14 @@ struct llama_server_context
     void rewind() {
         as_loop = false;
         params.antiprompt.clear();
-        no_show_words.clear();
        num_tokens_predicted = 0;
         generated_text = "";
+
+        //processed_tokens.clear();
+        embd_inp.clear();
+        n_remain = 0;
+        n_past = 0;
+        n_consumed = 0;
     }
 
     bool loadModel(gpt_params params_)
@@ -51,8 +53,7 @@ struct llama_server_context
             fprintf(stderr, "%s: error: unable to load model\n", __func__);
             return false;
         }
-        // determine newline token
-        llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
         last_n_tokens.resize(params.n_ctx);
         std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
         return true;
@@ -62,53 +63,14 @@ struct llama_server_context
         params.prompt.insert(0, 1, ' '); // always add a first space
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
-        int new_prompt_len = 0;
-        if (last_prompt_tokens == prompt_tokens)
-        {
-            //fprintf(stdout, "Context matched.\n");
-            processed_tokens = last_prompt_tokens;
-            embd_inp = last_prompt_tokens;
-            n_past = processed_tokens.size();
-            n_consumed = last_prompt_tokens.size() - 2;
-            has_next_token = true;
-            return true;
-        }
-        else
-        {
-            if (!processed_tokens.empty() && !embd_inp.empty())
-            {
-                //fprintf(stdout, "Resetting context.\n");
-                processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end());
-                embd_inp.erase(embd_inp.begin() + 1, embd_inp.end());
-                n_consumed = 0;
-                n_past = 0;
+        for (n_past = 0; n_past < prompt_tokens.size() - 1 && n_past < processed_tokens.size(); n_past++) {
+            if (prompt_tokens[n_past] != processed_tokens[n_past]) {
+                break;
             }
         }
-            for (size_t i = 0; i < prompt_tokens.size(); i++) {
-                if (i < processed_tokens.size() &&
-                    processed_tokens[i] == prompt_tokens[i])
-                {
-                    continue;
-                }
-                else
-                {
-                    embd_inp.push_back(prompt_tokens[i]);
-                    if(new_prompt_len == 0) {
-                        if(int32_t(i) - 1 < n_past) {
-                            processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
-                        }
-                        // Evaluate the new fragment prompt from the last token processed.
-                        n_past = processed_tokens.size();
-                    }
-                    new_prompt_len ++;
-                }
-            }
-            if(n_past > 0 && params.interactive) {
-                n_remain -= new_prompt_len;
-            }
-            if ((int)embd_inp.size() > params.n_ctx - 4)
-            {
-                return false;
+        processed_tokens.resize(n_past);
+        if (prompt_tokens.size() > n_past) {
+            embd_inp.insert(embd_inp.end(), prompt_tokens.begin() + n_past, prompt_tokens.end());
         }
         last_prompt_tokens = prompt_tokens;
         has_next_token = true;
@@ -131,7 +93,7 @@ struct llama_server_context
         llama_token result = -1;
         if (embd.size() > 0)
         {
-            if (n_past + (int)embd.size() > params.n_ctx)
+            if (n_past + embd.size() > (size_t)params.n_ctx)
             {
                 // Reset context
                 const int n_left = n_past - params.n_keep;
@@ -156,7 +118,7 @@ struct llama_server_context
             }
         }
         embd.clear();
-        if ((int)embd_inp.size() <= n_consumed && has_next_token)
+        if (embd_inp.size() <= n_consumed)
         {
             // out of user input, sample next token
             const float temp = params.temp;
@@ -243,18 +205,6 @@ struct llama_server_context
                 num_tokens_predicted++;
             }
 
-            // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive)
-            {
-                id = llama_token_newline.front();
-                if (params.antiprompt.size() != 0)
-                {
-                    // tokenize and inject first reverse prompt
-                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-                }
-            }
-
             // add it to the context
             embd.push_back(id);
             result = id;
@@ -264,7 +214,7 @@ struct llama_server_context
         else
         {
             // some user input remains from prompt or interaction, forward it to processing
-            while ((int)embd_inp.size() > n_consumed)
+            while (embd_inp.size() > n_consumed)
            {
                 embd.push_back(embd_inp[n_consumed]);
                 last_n_tokens.erase(last_n_tokens.begin());
@@ -277,41 +227,11 @@ struct llama_server_context
                 }
             }
         }
-        if (params.interactive && (int)embd_inp.size() <= n_consumed)
-        {
-            // check for reverse prompt
-            if (params.antiprompt.size())
-            {
-                std::string last_output;
-                for (auto id : last_n_tokens)
-                {
-                    last_output += llama_token_to_str(ctx, id);
-                }
-                has_next_token = true;
-                // Check if each of the reverse prompts appears at the end of the output.
-                for (std::string &antiprompt : params.antiprompt)
-                {
-                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos)
-                    {
-                        has_next_token = false;
-                        return result;
-                    }
-                }
-            }
-            if (n_past > 0)
-            {
-                has_next_token = true;
-            }
-        }
 
         if (!embd.empty() && embd.back() == llama_token_eos())
         {
             has_next_token = false;
         }
-        if (params.interactive && n_remain <= 0 && params.n_predict != -1)
-        {
-            n_remain = params.n_predict;
-        }
         has_next_token = n_remain != 0;
         return result;
     }
@@ -322,58 +242,23 @@ struct llama_server_context
         if (token == -1) {
             return "";
         }
-        tokens_predicted.clear();
-        tokens_predicted.push_back(token);
-
-        // Avoid add the no show words to the response
-        for (std::vector<llama_token> word_tokens : no_show_words)
-        {
-            size_t match_token = 1;
-            if (tokens_predicted.front() == word_tokens.front())
-            {
-                bool execute_matching = true;
-                if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-                    for (size_t i = 1; i < word_tokens.size(); i++)
-                    {
-                        if (i >= tokens_predicted.size()) {
-                            match_token = i;
-                            break;
-                        }
-                        if (tokens_predicted[i] == word_tokens[i])
-                        {
-                            continue;
-                        }
-                        else
-                        {
-                            execute_matching = false;
-                            break;
-                        }
-                    }
-                }
-                while (execute_matching) {
-                    if (match_token == word_tokens.size()) {
-                        return "";
-                    }
-                    token = nextToken();
-                    tokens_predicted.push_back(token);
-                    if (token == word_tokens[match_token])
-                    { // the token follow the sequence
-                        match_token++;
-                    }
-                    else if (match_token < word_tokens.size())
-                    { // no complete all word sequence
-                        break;
-                    }
-                }
-            }
-        }
+
         if(as_loop) {
             generated_text = "";
         }
-        for (llama_token tkn : tokens_predicted)
-        {
-            generated_text += llama_token_to_str(ctx, tkn);
+
+        std::string token_text = llama_token_to_str(ctx, token);
+        generated_text += token_text;
+
+        for (std::string word : params.antiprompt) {
+            size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size()));
+            if (i != std::string::npos) {
+                generated_text.erase(generated_text.begin() + i, generated_text.begin() + i + word.size());
+                has_next_token = false;
+                break;
+            }
         }
+
         return generated_text;
     }
 
@@ -616,10 +501,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
     {
         llama.as_loop = body["as_loop"].get<bool>();
     }
-    if (!body["interactive"].is_null())
-    {
-        llama.params.interactive = body["interactive"].get<bool>();
-    }
     if (!body["prompt"].is_null())
     {
         llama.params.prompt = body["prompt"].get<std::string>();
     }
@@ -635,20 +516,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
     }
     if (!body["stop"].is_null())
     {
-        std::vector<std::string> stop_words = body["stop"].get<std::vector<std::string>>();
-        for (std::string stop_word : stop_words)
-        {
-            llama.params.antiprompt.push_back(stop_word);
-            llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false));
-        }
-    }
-    if (!body["exclude"].is_null())
-    {
-        std::vector<std::string> no_show_words = body["exclude"].get<std::vector<std::string>>();
-        for (std::string no_show : no_show_words)
-        {
-            llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false));
-        }
+        llama.params.antiprompt = body["stop"].get<std::vector<std::string>>();
     }
     return true;
 }