From 3e3f38af482a104c608b20e5aa6d97e57f2f1c45 Mon Sep 17 00:00:00 2001
From: Stephen Nichols
Date: Tue, 25 Jul 2023 11:57:29 -0500
Subject: [PATCH] Fixing race condition in server.cpp and partial stream
 handling in completion.js

---
 examples/server/public/completion.js | 55 +++++++++++++++++++---------
 examples/server/server.cpp           |  8 +++-
 2 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index a43d5a7d5..4bc8e7804 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -43,6 +43,7 @@ export async function* llama(prompt, params = {}, config = {}) {
     const decoder = new TextDecoder();
 
     let content = "";
+    let leftover = ""; // Buffer for partially read lines
 
     try {
         let cont = true;
@@ -53,29 +54,47 @@ export async function* llama(prompt, params = {}, config = {}) {
                 break;
             }
 
-            // sse answers in the form multiple lines of: value\n with data always present as a key. in our case we
-            // mainly care about the data: key here, which we expect as json
-            const text = decoder.decode(result.value);
+            // Add any leftover data to the current chunk of data
+            const text = leftover + decoder.decode(result.value);
 
-            // parse all sse events and add them to result
-            const regex = /^(\S+):\s(.*)$/gm;
-            for (const match of text.matchAll(regex)) {
-                result[match[1]] = match[2]
+            // Check if the last character is a line break
+            const endsWithLineBreak = text.endsWith('\n');
+
+            // Split the text into lines
+            let lines = text.split('\n');
+
+            // If the text doesn't end with a line break, then the last line is incomplete
+            // Store it in leftover to be added to the next chunk of data
+            if (!endsWithLineBreak) {
+                leftover = lines.pop();
+            } else {
+                leftover = ""; // Reset leftover if we have a line break at the end
             }
 
-            // since we know this is llama.cpp, let's just decode the json in data
-            result.data = JSON.parse(result.data);
-            content += result.data.content;
+            // Parse all sse events and add them to result
+            const regex = /^(\S+):\s(.*)$/gm;
+            for (const line of lines) {
+                const match = regex.exec(line);
+                if (match) {
+                    result[match[1]] = match[2]
+                    // since we know this is llama.cpp, let's just decode the json in data
+                    if (result.data) {
+                        result.data = JSON.parse(result.data);
+                        content += result.data.content;
 
-            // yield
-            yield result;
+                        // yield
+                        yield result;
 
-            // if we got a stop token from server, we will break here
-            if (result.data.stop) {
-                if (result.data.generation_settings) {
-                    generation_settings = result.data.generation_settings;
+                        // if we got a stop token from server, we will break here
+                        if (result.data.stop) {
+                            if (result.data.generation_settings) {
+                                generation_settings = result.data.generation_settings;
+                            }
+                            cont = false;
+                            break;
+                        }
+                    }
                 }
-                break;
             }
         }
     } catch (e) {
@@ -165,4 +184,4 @@ export const llamaModelInfo = async () => {
         generation_settings = await fetch("/model.json").then(r => r.json());
     }
     return generation_settings;
-}
+}
\ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 83c03065a..5f05320ba 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -950,7 +950,7 @@ static json format_timings(llama_server_context &llama)
 {
     const auto timings = llama_get_timings(llama.ctx);
 
-    assert(timings.n_eval == llama.num_tokens_predicted);
+    // assert(timings.n_eval == llama.num_tokens_predicted);
 
     return json{
         {"prompt_n", timings.n_eval},
@@ -1263,7 +1263,11 @@ int main(int argc, char **argv)
                 sink.done();
                 return true;
             };
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+            const auto on_complete = [&](bool) {
+                llama.mutex.unlock();
+            };
+            lock.release();
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
     });
     svr.Get("/model.json", [&llama](const Request &, Response &res)
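
The completion.js change works because reader.read() hands back raw byte chunks that can end in the middle of an SSE line, so any trailing partial line has to be carried over and prepended to the next chunk before the "key: value" events are parsed. Below is a minimal standalone sketch of that same carry-over step, written in the server's C++; split_lines and the sample chunks are illustrative, not part of the patch.

    #include <iostream>
    #include <string>
    #include <vector>

    // Same idea as the completion.js hunk: keep any trailing partial
    // line in `leftover` until the next chunk completes it.
    static std::vector<std::string> split_lines(std::string &leftover, const std::string &chunk) {
        std::string text = leftover + chunk;
        std::vector<std::string> lines;
        size_t start = 0;
        size_t pos;
        while ((pos = text.find('\n', start)) != std::string::npos) {
            lines.push_back(text.substr(start, pos - start));
            start = pos + 1;
        }
        leftover = text.substr(start); // empty when the chunk ended on '\n'
        return lines;
    }

    int main() {
        std::string leftover;
        const std::vector<std::string> chunks = {
            "data: {\"content\": \"Hel",      // read ends mid-event
            "lo\"}\ndata: {\"stop\": true}\n", // completes it, plus one more
        };
        for (const auto &chunk : chunks) {
            for (const auto &line : split_lines(leftover, chunk)) {
                std::cout << "event line: " << line << "\n";
            }
        }
        // leftover is empty here; a trailing partial line would still be buffered
        return 0;
    }

The first read yields no complete line; the second yields both events intact, which is exactly the behavior the old matchAll-over-the-raw-chunk code could not guarantee.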
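The server.cpp change addresses the race itself: the /completion handler held llama's lock through a scoped guard, but res.set_chunked_content_provider() returns before streaming finishes, so the guard unlocked the model while the content provider was still generating and a second request could grab it mid-stream. Releasing the guard and unlocking inside on_complete keeps the mutex held for the whole life of the response. Here is a same-thread sketch of that hand-off, with a stand-in send_streaming_response playing the role of cpp-httplib's completion machinery; all names besides unique_lock::release are illustrative.

    #include <functional>
    #include <iostream>
    #include <mutex>

    std::mutex model_mutex; // stands in for llama.mutex

    // Stand-in for the HTTP library: it streams the chunks and then fires
    // the completion callback, the role on_complete plays in the patch.
    static void send_streaming_response(const std::function<void(bool)> &on_complete) {
        // ... chunks would be written here, with model_mutex still held ...
        on_complete(true);
    }

    static void handle_completion_request() {
        std::unique_lock<std::mutex> lock(model_mutex);

        // Detach the guard so the mutex stays locked past this scope;
        // the completion callback gives it back when streaming ends.
        lock.release();
        send_streaming_response([](bool /*success*/) {
            model_mutex.unlock();
        });
    }

    int main() {
        handle_completion_request();
        handle_completion_request(); // would deadlock if on_complete never ran
        std::cout << "two requests served without racing on the model\n";
        return 0;
    }

The commented-out assert in format_timings follows from the same fix: with the lock now held across the whole stream, timings.n_eval and num_tokens_predicted are sampled at a different point than before, so the old invariant no longer holds as written.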