From 3e3f38af482a104c608b20e5aa6d97e57f2f1c45 Mon Sep 17 00:00:00 2001
From: Stephen Nichols
Date: Tue, 25 Jul 2023 11:57:29 -0500
Subject: [PATCH] Fixing race condition in server.cpp and partial stream
 handling in completion.js

---
 examples/server/public/completion.js | 55 +++++++++++++++++++---------
 examples/server/server.cpp           |  8 +++-
 2 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index a43d5a7d5..4bc8e7804 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -43,6 +43,7 @@ export async function* llama(prompt, params = {}, config = {}) {
     const decoder = new TextDecoder();
 
     let content = "";
+    let leftover = ""; // Buffer for partially read lines
 
     try {
         let cont = true;
@@ -53,29 +54,47 @@ export async function* llama(prompt, params = {}, config = {}) {
                 break;
             }
 
-            // sse answers in the form multiple lines of: value\n with data always present as a key. in our case we
-            // mainly care about the data: key here, which we expect as json
-            const text = decoder.decode(result.value);
+            // Add any leftover data to the current chunk of data
+            const text = leftover + decoder.decode(result.value);
 
-            // parse all sse events and add them to result
-            const regex = /^(\S+):\s(.*)$/gm;
-            for (const match of text.matchAll(regex)) {
-                result[match[1]] = match[2]
+            // Check if the last character is a line break
+            const endsWithLineBreak = text.endsWith('\n');
+
+            // Split the text into lines
+            let lines = text.split('\n');
+
+            // If the text doesn't end with a line break, then the last line is incomplete
+            // Store it in leftover to be added to the next chunk of data
+            if (!endsWithLineBreak) {
+                leftover = lines.pop();
+            } else {
+                leftover = ""; // Reset leftover if we have a line break at the end
             }
 
-            // since we know this is llama.cpp, let's just decode the json in data
-            result.data = JSON.parse(result.data);
-            content += result.data.content;
+            // Parse all sse events and add them to result
+            const regex = /^(\S+):\s(.*)$/gm;
+            for (const line of lines) {
+                const match = regex.exec(line);
+                if (match) {
+                    result[match[1]] = match[2]
+                    // since we know this is llama.cpp, let's just decode the json in data
+                    if (result.data) {
+                        result.data = JSON.parse(result.data);
+                        content += result.data.content;
 
-            // yield
-            yield result;
+                        // yield
+                        yield result;
 
-            // if we got a stop token from server, we will break here
-            if (result.data.stop) {
-                if (result.data.generation_settings) {
-                    generation_settings = result.data.generation_settings;
+                        // if we got a stop token from server, we will break here
+                        if (result.data.stop) {
+                            if (result.data.generation_settings) {
+                                generation_settings = result.data.generation_settings;
+                            }
+                            cont = false;
+                            break;
+                        }
+                    }
                 }
-                break;
             }
         }
     } catch (e) {
@@ -165,4 +184,4 @@ export const llamaModelInfo = async () => {
         generation_settings = await fetch("/model.json").then(r => r.json());
     }
     return generation_settings;
-}
+}
\ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 83c03065a..5f05320ba 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -950,7 +950,7 @@ static json format_timings(llama_server_context &llama)
 {
     const auto timings = llama_get_timings(llama.ctx);
 
-    assert(timings.n_eval == llama.num_tokens_predicted);
+    // assert(timings.n_eval == llama.num_tokens_predicted);
 
     return json{
         {"prompt_n", timings.n_eval},
@@ -1263,7 +1263,11 @@ int main(int argc, char **argv)
                 sink.done();
                 return true;
             };
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+            const auto on_complete = [&](bool) {
+                llama.mutex.unlock();
+            };
+            lock.release();
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
     });
     svr.Get("/model.json", [&llama](const Request &, Response &res)
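
The completion.js change works because reader.read() hands back raw byte chunks that can end in the middle of an SSE line, so any trailing partial line has to be carried over and prepended to the next chunk before the "key: value" events are parsed. Below is a minimal standalone sketch of that same carry-over step, written in the server's C++; split_lines and the sample chunks are illustrative, not part of the patch.

    #include <iostream>
    #include <string>
    #include <vector>

    // Same idea as the completion.js hunk: keep any trailing partial
    // line in `leftover` until the next chunk completes it.
    static std::vector<std::string> split_lines(std::string &leftover, const std::string &chunk) {
        std::string text = leftover + chunk;
        std::vector<std::string> lines;
        size_t start = 0;
        size_t pos;
        while ((pos = text.find('\n', start)) != std::string::npos) {
            lines.push_back(text.substr(start, pos - start));
            start = pos + 1;
        }
        leftover = text.substr(start); // empty when the chunk ended on '\n'
        return lines;
    }

    int main() {
        std::string leftover;
        const std::vector<std::string> chunks = {
            "data: {\"content\": \"Hel",      // read ends mid-event
            "lo\"}\ndata: {\"stop\": true}\n", // completes it, plus one more
        };
        for (const auto &chunk : chunks) {
            for (const auto &line : split_lines(leftover, chunk)) {
                std::cout << "event line: " << line << "\n";
            }
        }
        // leftover is empty here; a trailing partial line would still be buffered
        return 0;
    }

The first read yields no complete line; the second yields both events intact, which is exactly the behavior the old matchAll-over-the-raw-chunk code could not guarantee.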
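The server.cpp change addresses the race itself: the /completion handler held llama's lock through a scoped guard, but res.set_chunked_content_provider() returns before streaming finishes, so the guard unlocked the model while the content provider was still generating and a second request could grab it mid-stream. Releasing the guard and unlocking inside on_complete keeps the mutex held for the whole life of the response. Here is a same-thread sketch of that hand-off, with a stand-in send_streaming_response playing the role of cpp-httplib's completion machinery; all names besides unique_lock::release are illustrative.

    #include <functional>
    #include <iostream>
    #include <mutex>

    std::mutex model_mutex; // stands in for llama.mutex

    // Stand-in for the HTTP library: it streams the chunks and then fires
    // the completion callback, the role on_complete plays in the patch.
    static void send_streaming_response(const std::function<void(bool)> &on_complete) {
        // ... chunks would be written here, with model_mutex still held ...
        on_complete(true);
    }

    static void handle_completion_request() {
        std::unique_lock<std::mutex> lock(model_mutex);

        // Detach the guard so the mutex stays locked past this scope;
        // the completion callback gives it back when streaming ends.
        lock.release();
        send_streaming_response([](bool /*success*/) {
            model_mutex.unlock();
        });
    }

    int main() {
        handle_completion_request();
        handle_completion_request(); // would deadlock if on_complete never ran
        std::cout << "two requests served without racing on the model\n";
        return 0;
    }

The commented-out assert in format_timings follows from the same fix: with the lock now held across the whole stream, timings.n_eval and num_tokens_predicted are sampled at a different point than before, so the old invariant no longer holds as written.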