avoid sending an unnecessary empty data event, and send the rest of the partial tokens on stop

This commit is contained in:
Jhen 2023-08-23 15:52:49 +08:00
parent 3fc1127e2f
commit 3f436ea3f3

View file

@ -1330,21 +1330,27 @@ int main(int argc, char **argv)
size_t pos = std::min(sent_count, llama.generated_text.size());
const std::string str_test = llama.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos =
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
if (stop_pos != std::string::npos) {
is_stop_full = true;
llama.generated_text.erase(
llama.generated_text.begin() + pos + stop_pos,
llama.generated_text.end());
pos = std::min(sent_count, llama.generated_text.size());
} else {
is_stop_full = false;
stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
STOP_PARTIAL);
}
const std::string to_send = stop_pos == std::string::npos
? llama.generated_text.substr(pos, std::string::npos)
: ""; // just don't send anything if we're not done
if (
stop_pos == std::string::npos ||
// Send rest of the text if we are at the end of the generation
(!llama.has_next_token && !is_stop_full && stop_pos > 0)
) {
const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
sent_count += to_send.size();
@ -1360,9 +1366,6 @@ int main(int argc, char **argv)
sent_token_probs_index = probs_stop_pos;
}
{
// Always send partial response
// so we can get the correct partial response of the last to_send in the client
const json data = format_partial_response(llama, to_send, probs_output);
const std::string str =