server : fix multibyte handle in partial response (#3706)

2023-10-21 19:58:03 +08:00 · 2023-10-21 19:58:03 +08:00 · 17b23eb9cb
commit 17b23eb9cb
parent 778c070d1b
1 changed files with 30 additions and 26 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1005,32 +1005,6 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;
        size_t pos = std::min(slot.sent_count, slot.generated_text.size());
        const std::string str_test = slot.generated_text.substr(pos);
        bool is_stop_full = false;
        size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
        if (stop_pos != std::string::npos) {
            is_stop_full = true;
            slot.generated_text.erase(
                slot.generated_text.begin() + pos + stop_pos,
                slot.generated_text.end());
            pos = std::min(slot.sent_count, slot.generated_text.size());
        } else {
            is_stop_full = false;
            stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
        }
        // check if there is any token to predict
        if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
            // do not send the stop word in the response
            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
            slot.sent_count += result.text_to_send.size();
            // add the token to slot queue and cache
        }
        slot.add_token_string(result);
        if(slot.params.stream) {
            send_partial_response(slot, result);
        }
        if (slot.multibyte_pending > 0)
        {
            slot.multibyte_pending -= token_str.size();
@@ -1059,6 +1033,36 @@ struct llama_server_context
            }
        }
        if (slot.multibyte_pending == 0)
        {
            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
            const std::string str_test = slot.generated_text.substr(pos);
            bool is_stop_full = false;
            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
            if (stop_pos != std::string::npos) {
                is_stop_full = true;
                slot.generated_text.erase(
                    slot.generated_text.begin() + pos + stop_pos,
                    slot.generated_text.end());
                pos = std::min(slot.sent_count, slot.generated_text.size());
            } else {
                is_stop_full = false;
                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
            }
            // check if there is any token to predict
            if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
                // do not send the stop word in the response
                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
                slot.sent_count += result.text_to_send.size();
                // add the token to slot queue and cache
            }
            slot.add_token_string(result);
            if (slot.params.stream) {
                send_partial_response(slot, result);
            }
        }
        if (slot.multibyte_pending > 0 && !slot.has_next_token)
        {
            slot.has_next_token = true;