From b31ba86ace4fe8605bb0fa06fbfd5b7781ca6f88 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Fri, 28 Apr 2023 16:15:26 -0700
Subject: [PATCH] Introduce prompt caching so prompts load instantly

This change also introduces an ephemeral status line in non-verbose
mode to display a load percentage status when slow operations are
happening.
---
 .gitignore                    |   1 +
 third_party/ggml/README.cosmo |   2 +
 third_party/ggml/common.cc    |   4 +-
 third_party/ggml/common.h     |   5 +-
 third_party/ggml/ggml.mk      |   1 +
 third_party/ggml/llama.cc     |  65 +++---
 third_party/ggml/main.cc      | 358 ++++++++++++++++++++++++++++------
 7 files changed, 333 insertions(+), 103 deletions(-)

diff --git a/.gitignore b/.gitignore
index 764bb43d6..3412f6af2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # -*- conf -*-
 
 /o
+/.prompt.jtlp
 
 # TODO: Find some way to have Python write to o/
 __pycache__
diff --git a/third_party/ggml/README.cosmo b/third_party/ggml/README.cosmo
index 95a8d5380..da55be996 100644
--- a/third_party/ggml/README.cosmo
+++ b/third_party/ggml/README.cosmo
@@ -16,7 +16,9 @@ ORIGIN
 
 LOCAL CHANGES
 
+  - Make it possible for loaded prompts to be cached to disk
   - Introduce -v and --verbose flags
+  - Reduce batch size from 512 to 32
   - Don't print stats / diagnostics unless -v is passed
   - Reduce --top_p default from 0.95 to 0.70
   - Change --reverse-prompt to no longer imply --interactive
diff --git a/third_party/ggml/common.cc b/third_party/ggml/common.cc
index 62aaa405e..f73a8f291 100644
--- a/third_party/ggml/common.cc
+++ b/third_party/ggml/common.cc
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  llama.cpp                                                                   │
diff --git a/third_party/ggml/common.h b/third_party/ggml/common.h
index cb5992535..400b91ae2 100644
--- a/third_party/ggml/common.h
+++ b/third_party/ggml/common.h
@@ -23,7 +23,7 @@ struct gpt_params {
     int32_t repeat_last_n = 64;   // last n tokens to penalize
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch       = 32;   // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
 
     // sampling parameters
@@ -34,6 +34,7 @@ struct gpt_params {
 
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
+    std::string prompt_path = ".prompt.jtlp";
     std::string input_prefix = "";  // string to prefix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
@@ -42,7 +43,7 @@ struct gpt_params {
 
     bool memory_f16    = true;  // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
-    bool use_color     = false; // use color to distinguish generations and inputs
+    bool use_color     = isatty(1) == 1; // use color to distinguish generations and inputs
     bool interactive   = false; // interactive mode
 
     bool embedding     = false; // get only sentence embedding
diff --git a/third_party/ggml/ggml.mk b/third_party/ggml/ggml.mk
index 06da94a58..7aa1dd907 100644
--- a/third_party/ggml/ggml.mk
+++ b/third_party/ggml/ggml.mk
@@ -72,6 +72,7 @@ THIRD_PARTY_GGML_LLAMA_DIRECTDEPS =			\
 	LIBC_NEXGEN32E					\
 	LIBC_RUNTIME					\
 	LIBC_STDIO					\
+	LIBC_LOG					\
 	LIBC_STR					\
 	LIBC_STUBS					\
 	LIBC_SYSV					\
diff --git a/third_party/ggml/llama.cc b/third_party/ggml/llama.cc
index 8c1fb3a93..08c7eacc5 100644
--- a/third_party/ggml/llama.cc
+++ b/third_party/ggml/llama.cc
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  llama.cpp                                                                   │
@@ -25,6 +25,30 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/ggml/llama.h"
+#include "libc/intrin/bits.h"
+#include "third_party/ggml/ggml.h"
+#include "third_party/ggml/llama_util.h"
+#include "third_party/libcxx/algorithm"
+#include "third_party/libcxx/array"
+#include "third_party/libcxx/atomic"
+#include "third_party/libcxx/cassert"
+#include "third_party/libcxx/cinttypes"
+#include "third_party/libcxx/climits"
+#include "third_party/libcxx/cstdint"
+#include "third_party/libcxx/cstdio"
+#include "third_party/libcxx/cstring"
+#include "third_party/libcxx/ctime"
+#include "third_party/libcxx/fstream"
+#include "third_party/libcxx/initializer_list"
+#include "third_party/libcxx/map"
+#include "third_party/libcxx/memory"
+#include "third_party/libcxx/mutex"
+#include "third_party/libcxx/queue"
+#include "third_party/libcxx/random"
+#include "third_party/libcxx/sstream"
+#include "third_party/libcxx/thread"
+#include "third_party/libcxx/unordered_map"
 
 asm(".ident\t\"\\n\\n\
 llama.cpp (MIT License)\\n\
@@ -32,46 +56,9 @@ Copyright (c) 2023 Georgi Gerganov\"");
 asm(".include \"libc/disclaimer.inc\"");
 // clang-format off
 
-// Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#include "third_party/libcxx/cstdint"
-#include "third_party/libcxx/cstdio"
-#endif
-
-#include "third_party/ggml/llama_util.h"
-#include "third_party/ggml/llama.h"
-
-#include "third_party/ggml/ggml.h"
-
-#include "third_party/libcxx/array"
-#include "third_party/libcxx/ctime"
-#include "third_party/libcxx/cinttypes"
-#include "third_party/libcxx/fstream"
-#include "third_party/libcxx/random"
-#include "third_party/libcxx/map"
-#include "third_party/libcxx/unordered_map"
-#include "third_party/libcxx/queue"
-#include "third_party/libcxx/cassert"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/climits"
-#include "third_party/libcxx/memory"
-#include "third_party/libcxx/algorithm"
-#include "third_party/libcxx/initializer_list"
-#include "third_party/libcxx/thread"
-#include "third_party/libcxx/atomic"
-#include "third_party/libcxx/mutex"
-#include "third_party/libcxx/sstream"
-
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
-#define READ32BE(s)                                  \
-    ((uint32_t)((const uint8_t *)(s))[0] << 030 |    \
-     (uint32_t)((const uint8_t *)(s))[1] << 020 |    \
-     (uint32_t)((const uint8_t *)(s))[2] << 010 |    \
-     (uint32_t)((const uint8_t *)(s))[3] << 000)
-
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
diff --git a/third_party/ggml/main.cc b/third_party/ggml/main.cc
index 330109dbc..697220e64 100644
--- a/third_party/ggml/main.cc
+++ b/third_party/ggml/main.cc
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
 │  llama.cpp                                                                   │
@@ -25,6 +25,23 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/calls/struct/stat.h"
+#include "libc/intrin/bits.h"
+#include "libc/log/log.h"
+#include "libc/nexgen32e/x86feature.h"
+#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/msync.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/prot.h"
+#include "libc/sysv/consts/sig.h"
+#include "third_party/ggml/common.h"
+#include "third_party/ggml/llama.h"
+#include "third_party/ggml/llama_util.h"
+#include "third_party/libcxx/iostream"
+#include "third_party/libcxx/string"
+#include "third_party/libcxx/vector"
 
 asm(".ident\t\"\\n\\n\
 llama.cpp (MIT License)\\n\
@@ -32,62 +49,13 @@ Copyright (c) 2023 Georgi Gerganov\"");
 asm(".include \"libc/disclaimer.inc\"");
 // clang-format off
 
-// Defines sigaction on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include "third_party/ggml/common.h"
-#include "libc/nexgen32e/x86feature.h"
-#include "third_party/ggml/llama.h"
-
-#include "third_party/libcxx/cassert"
-#include "third_party/libcxx/cinttypes"
-#include "third_party/libcxx/cmath"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/ctime"
-#include "third_party/libcxx/fstream"
-#include "third_party/libcxx/iostream"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include "libc/calls/calls.h"
-#include "libc/calls/sigtimedwait.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/siginfo.h"
-#include "libc/sysv/consts/sa.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/ss.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/weirdtypes.h"
-#include "libc/runtime/pathconf.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/sysv/consts/f.h"
-#include "libc/sysv/consts/fileno.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/ok.h"
-#include "libc/time/time.h"
-#include "third_party/getopt/getopt.h"
-#include "third_party/musl/crypt.h"
-#include "third_party/musl/lockf.h"
-#elif defined (_WIN32)
-#include "libc/calls/calls.h"
-#include "libc/calls/sigtimedwait.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/siginfo.h"
-#include "libc/sysv/consts/sa.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/ss.h"
-#endif
-
 static console_state con_st;
 static llama_context ** g_ctx;
 static bool is_interacting = false;
 
+#define EPHEMERAL(fmt) "\r\e[K\033[1;35m" fmt " \033[0m"
+
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
     set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
@@ -102,6 +70,14 @@ void sigint_handler(int signo) {
 }
 #endif
 
+static int CompareTime(struct timespec a, struct timespec b) {
+    int cmp;
+    if (!(cmp = (a.tv_sec > b.tv_sec) - (a.tv_sec < b.tv_sec))) {
+        cmp = (a.tv_nsec > b.tv_nsec) - (a.tv_nsec < b.tv_nsec);
+    }
+    return cmp;
+}
+
 static int on_missing_feature(const char *name) {
     fprintf(stderr, "error: we require %s support in your microprocessor.\n", name);
     return 1;
@@ -109,6 +85,9 @@ static int on_missing_feature(const char *name) {
 
 int main(int argc, char ** argv) {
     gpt_params params;
+
+    ShowCrashReports();
+
     params.model = "models/llama-7B/ggml-model.bin";
 
     if (!X86_HAVE(AVX2)) return on_missing_feature("avx2");
@@ -167,6 +146,7 @@ int main(int argc, char ** argv) {
 //bool is_prime(int n) {)";
 
     llama_context * ctx;
+    struct stat model_stat;
     g_ctx = &ctx;
 
     // load the model
@@ -182,8 +162,9 @@ int main(int argc, char ** argv) {
 
         ctx = llama_init_from_file(params.model.c_str(), lparams, params.verbose);
 
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        if (ctx == NULL || stat(params.model.c_str(), &model_stat)) {
+            fprintf(stderr, "%s: failed to load model: %s\n",
+                    params.model.c_str(), strerror(errno));
             return 1;
         }
     }
@@ -327,6 +308,28 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_first;
     }
 
+    const uint32_t kJtlpMagic = READ32LE("jtlp");
+    const uint32_t kJtlpVersion = 0;
+
+    struct jtlp_header {
+        uint8_t magic[4];
+        uint8_t version[4];
+        uint8_t state_size[8];
+        uint8_t model_dev[8];
+        uint8_t model_ino[8];
+        uint8_t model_mtim_sec[8];
+        uint8_t model_mtim_nsec[8];
+        uint8_t prompt_size[8];
+    };
+
+    enum jtlp_status {
+        kPromptPending,
+        kPromptCompleted,
+        kPromptFinished
+    };
+
+    enum jtlp_status prompt_status = kPromptPending;
+
     bool is_antiprompt = false;
     bool input_noecho = !params.verbose;
 
@@ -334,13 +337,146 @@ int main(int argc, char ** argv) {
     int n_remain   = params.n_predict;
     int n_consumed = 0;
 
-    // the first thing we will do is to output the prompt, so set color accordingly
-    set_console_color(con_st, CONSOLE_COLOR_PROMPT);
+    // instantly reload prompt if it's cached
+    int fd = open(params.prompt_path.c_str(), O_RDONLY);
+    if (fd != -1) {
+        size_t state_size;
+        size_t prompt_size;
+        struct timespec mtim;
+        struct jtlp_header *header;
+        off_t rc = lseek(fd, 0, SEEK_END);
+        LLAMA_ASSERT(rc != -1);
+        void *map = MAP_FAILED;
+        size_t file_size = rc;
+        if (file_size < sizeof(*header)) {
+            fprintf(stderr, "%s: prompt file too small\n",
+                    params.prompt_path.c_str());
+            goto CantReloadPrompt;
+        }
+        map = mmap(0, file_size, PROT_READ, MAP_SHARED, fd, 0);
+        if (map == MAP_FAILED) {
+            fprintf(stderr, "%s: mmap failed: %s\n",
+                    params.prompt_path.c_str(), strerror(errno));
+            goto CantReloadPrompt;
+        }
+        header = (struct jtlp_header *)map;
+        // check file format magic
+        if (READ32LE(header->magic) != kJtlpMagic) {
+            fprintf(stderr, "%s: prompt file has wrong magic\n",
+                    params.prompt_path.c_str());
+            goto CantReloadPrompt;
+        }
+        // check file format version
+        if (READ32LE(header->version) > kJtlpVersion) {
+            fprintf(stderr, "%s: prompt has future file format version\n",
+                    params.prompt_path.c_str());
+            goto CantReloadPrompt;
+        }
+        // check expected state size
+        state_size = llama_get_state_size(ctx);
+        if (READ64LE(header->state_size) != state_size) {
+            if (params.verbose) {
+                fprintf(stderr, "%s: prompt has stale data state size\n",
+                        params.prompt_path.c_str());
+            }
+            goto CantReloadPrompt;
+        }
+        // check model device id
+        if (READ64LE(header->model_dev) != model_stat.st_dev) {
+            fprintf(stderr, "%s: prompt is for different model (dev)\n",
+                    params.prompt_path.c_str());
+            goto CantReloadPrompt;
+        }
+        // check model inode id
+        if (READ64LE(header->model_ino) != model_stat.st_ino) {
+            fprintf(stderr, "%s: prompt is for different model (ino)\n",
+                    params.prompt_path.c_str());
+            goto CantReloadPrompt;
+        }
+        // check model modified timestamp
+        mtim.tv_sec = READ64LE(header->model_mtim_sec);
+        mtim.tv_nsec = READ64LE(header->model_mtim_nsec);
+        if (CompareTime(model_stat.st_mtim, mtim) > 0) {
+            if (params.verbose) {
+                fprintf(stderr, "%s: model file timestamp changed; will reload and regenerate prompt\n",
+                        params.prompt_path.c_str());
+            }
+            goto CantReloadPrompt;
+        }
+        // check prompt file size
+        prompt_size = READ64LE(header->prompt_size);
+        if (sizeof(struct jtlp_header) + prompt_size + state_size > file_size) {
+            fprintf(stderr, "%s: prompt file size unexpected\n",
+                    params.prompt_path.c_str());
+            goto CantReloadPrompt;
+        }
+        // check prompt text
+        if (prompt_size != params.prompt.size() ||
+            memcmp(header + 1, params.prompt.c_str(), prompt_size) != 0) {
+            if (params.verbose) {
+                fprintf(stderr, "%s: prompt text changed; will reload and regenerate\n",
+                        params.prompt_path.c_str());
+            }
+            goto CantReloadPrompt;
+        }
+        // read the transformer state
+        llama_set_state_data(ctx, (uint8_t *)(header + 1) + prompt_size);
+        // we're finished loading the prompt file
+        if (params.verbose) {
+            fprintf(stderr, "%s: %s: reloaded previously saved prompt\n",
+                    __func__, params.prompt_path.c_str());
+        }
+        // now setup the business logic
+        llama_set_rng_seed(ctx, params.seed);
+        while ((int) embd_inp.size() > n_consumed) {
+            last_n_tokens.erase(last_n_tokens.begin());
+            last_n_tokens.push_back(embd_inp[n_consumed++]);
+        }
+        n_past = n_consumed;
+        prompt_status = kPromptFinished;
+        if (params.interactive) {
+            is_interacting = true;
+            fflush(stdout);
+            std::string last_output;
+            for (auto id : last_n_tokens) {
+                last_output += llama_token_to_str(ctx, id);
+            }
+            for (std::string & antiprompt : params.antiprompt) {
+                if (last_output.find(antiprompt.c_str(),
+                                     last_output.length() - antiprompt.length(),
+                                     antiprompt.length()) != std::string::npos) {
+                    set_console_color(con_st, CONSOLE_COLOR_PROMPT);
+                    printf("%s", antiprompt.c_str());
+                    fflush(stdout);
+                    break;
+                }
+            }
+            set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
+        }
+  CantReloadPrompt:
+        if (map != MAP_FAILED) {
+            munmap(map, file_size);
+        }
+        close(fd);
+    }
+
+    if (prompt_status == kPromptPending && params.verbose) {
+        // the first thing we will do is to output the prompt, so set color accordingly
+        set_console_color(con_st, CONSOLE_COLOR_PROMPT);
+    }
 
     std::vector<llama_token> embd;
 
+    if (prompt_status == kPromptPending &&
+        !params.verbose && con_st.use_color) {
+        fprintf(stderr, EPHEMERAL("loading model..."));
+        fflush(stderr);
+    }
+
     while (n_remain != 0 || params.interactive) {
-        // predict
+
+        // perform inference evaluation of scheduled tokens
+        // this loads prompt tokens and it also does prediction
         if (embd.size() > 0) {
             // infinite text generation via context swapping
             // if we run out of context:
@@ -375,11 +511,102 @@ int main(int argc, char ** argv) {
                     return 1;
                 }
                 n_past += n_eval;
+                if (prompt_status == kPromptPending &&
+                    !params.verbose && con_st.use_color && embd_inp.size()) {
+                    fprintf(stderr, EPHEMERAL("loading prompt %d%% ..."),
+                            (int)(n_consumed / (double)embd_inp.size() * 100));
+                    fflush(stderr);
+                }
             }
         }
 
         embd.clear();
 
+        // save prompt to disk atomically as soon as it's finished loading
+        bool was_completed = prompt_status == kPromptCompleted;
+        if (was_completed && !params.prompt_path.empty()) {
+            int fd = -1;
+            int close_rc;
+            uint8_t buf[8];
+            size_t file_size;
+            size_t state_size;
+            std::string tmppath;
+            void *map = MAP_FAILED;
+            struct jtlp_header header;
+            if (!params.verbose && con_st.use_color) {
+                fprintf(stderr, EPHEMERAL("caching prompt..."));
+                fflush(stderr);
+            }
+            state_size = llama_get_state_size(ctx);
+            WRITE32LE(header.magic, kJtlpMagic);
+            WRITE32LE(header.version, kJtlpVersion);
+            WRITE64LE(header.state_size, state_size);
+            WRITE64LE(header.model_dev, model_stat.st_dev);
+            WRITE64LE(header.model_ino, model_stat.st_ino);
+            WRITE64LE(header.model_mtim_sec, model_stat.st_mtim.tv_sec);
+            WRITE64LE(header.model_mtim_nsec, model_stat.st_mtim.tv_nsec);
+            WRITE64LE(header.prompt_size, params.prompt.size());
+            file_size = sizeof(header) + params.prompt.size() + state_size;
+            tmppath.append(params.prompt_path);
+            tmppath.append(".XXXXXX");
+            fd = mkstemp(&tmppath[0]);
+            if (fd == -1) {
+                fprintf(stderr, "%s: mkstemp failed: %s\n",
+                        tmppath.c_str(), strerror(errno));
+                goto CouldNotSavePrompt;
+            }
+            if (ftruncate(fd, file_size)) {
+                fprintf(stderr, "%s: ftruncate failed: %s\n",
+                        tmppath.c_str(), strerror(errno));
+                goto CouldNotSavePrompt;
+            }
+            map = mmap(0, file_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+            if (map == MAP_FAILED) {
+                fprintf(stderr, "%s: mmap failed: %s\n",
+                        tmppath.c_str(), strerror(errno));
+                goto CouldNotSavePrompt;
+            }
+            llama_copy_state_data(ctx, (uint8_t *)map + sizeof(header) + params.prompt.size());
+            memcpy((uint8_t *)map + sizeof(header), params.prompt.c_str(), params.prompt.size());
+            memcpy(map, &header, sizeof(header));
+            if (msync(map, file_size, MS_ASYNC) && params.verbose) {
+                fprintf(stderr, "%s: msync failed: %s\n",
+                        tmppath.c_str(), strerror(errno));
+            }
+            if (munmap(map, file_size) && params.verbose) {
+                fprintf(stderr, "%s: munmap failed: %s\n",
+                        tmppath.c_str(), strerror(errno));
+            }
+            map = MAP_FAILED;
+            close_rc = close(fd);
+            fd = -1;
+            if (close_rc) {
+                fprintf(stderr, "%s: close failed: %s\n",
+                        tmppath.c_str(), strerror(errno));
+                goto CouldNotSavePrompt;
+            }
+            if (rename(tmppath.c_str(), params.prompt_path.c_str())) {
+                fprintf(stderr, "%s -> %s: rename failed: %s\n",
+                        tmppath.c_str(), params.prompt_path.c_str(), strerror(errno));
+                goto CouldNotSavePrompt;
+            }
+            tmppath.clear();
+      CouldNotSavePrompt:
+            if (map != MAP_FAILED) munmap(map, file_size);
+            if (fd != -1) close(fd);
+            if (!tmppath.empty()) unlink(tmppath.c_str());
+        }
+        if (was_completed) {
+            if (!params.verbose && con_st.use_color) {
+                fprintf(stderr, EPHEMERAL(""));
+                fflush(stderr);
+            }
+            if (params.interactive) {
+                is_interacting = true;
+            }
+            prompt_status = kPromptFinished;
+        }
+
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
             const int32_t top_k = params.top_k;
@@ -422,17 +649,23 @@ int main(int argc, char ** argv) {
 
             // decrement remaining sampling budget
             --n_remain;
+
         } else {
             // some user input remains from prompt or interaction, forward it to processing
             while ((int) embd_inp.size() > n_consumed) {
                 embd.push_back(embd_inp[n_consumed]);
                 last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[n_consumed]);
-                ++n_consumed;
+                last_n_tokens.push_back(embd_inp[n_consumed++]);
                 if ((int) embd.size() >= params.n_batch) {
                     break;
                 }
             }
+
+            // we've nearly finished loading the prompt
+            if (prompt_status == kPromptPending &&
+                (int) embd_inp.size() <= n_consumed) {
+                prompt_status = kPromptCompleted;
+            }
         }
 
         // checks for reverse prompt
@@ -476,6 +709,10 @@ int main(int argc, char ** argv) {
             }
             fflush(stdout);
         }
+        if (prompt_status == kPromptCompleted) {
+            continue;  // avoid reading line before last token loads
+        }
+
         // reset color to default if we there is no pending user input
         if (params.verbose && !input_noecho && (int)embd_inp.size() == n_consumed) {
             set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
@@ -521,6 +758,7 @@ int main(int argc, char ** argv) {
                 }
                 win32_utf8_encode(wline, line);
 #else
+                fflush(stdout);
                 if (!std::getline(std::cin, line)) {
                     // input stream is bad or EOF received
                     return 0;
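-- 

The prompt cache above is a single flat file: a 56-byte header, then the
verbatim prompt text, then the transformer state blob from
llama_copy_state_data(). Every header field is a little-endian byte array,
so the struct needs no packing pragmas and the file does not depend on the
host's endianness. Below is a minimal standalone sketch of that layout and
the freshness checks, assuming a POSIX.1-2008 struct stat (st_mtim). The
read32le/read64le/write64le helpers stand in for Cosmopolitan's
READ32LE/READ64LE/WRITE64LE macros from libc/intrin/bits.h, and
fill_header/cache_is_fresh are hypothetical names, not functions from this
patch:

    // Sketch of the ".prompt.jtlp" on-disk format used by the patch.
    #include <cstdint>
    #include <cstring>
    #include <sys/stat.h>

    static uint32_t read32le(const uint8_t *p) {
        return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
               (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
    }

    static uint64_t read64le(const uint8_t *p) {
        uint64_t x = 0;
        for (int i = 0; i < 8; ++i) x |= (uint64_t)p[i] << (8 * i);
        return x;
    }

    static void write64le(uint8_t *p, uint64_t x) {
        for (int i = 0; i < 8; ++i) p[i] = (uint8_t)(x >> (8 * i));
    }

    // 56 bytes of uint8_t arrays: no padding, no endianness dependency.
    // The file is this header, then the prompt text, then the state blob.
    struct jtlp_header {
        uint8_t magic[4];           // "jtlp"
        uint8_t version[4];         // format version, currently 0
        uint8_t state_size[8];      // llama_get_state_size() at save time
        uint8_t model_dev[8];       // st_dev of the model file
        uint8_t model_ino[8];       // st_ino of the model file
        uint8_t model_mtim_sec[8];  // st_mtim of the model file
        uint8_t model_mtim_nsec[8];
        uint8_t prompt_size[8];     // byte length of the prompt text
    };
    static_assert(sizeof(jtlp_header) == 56, "header must stay packed");

    // What the save path records (hypothetical helper name).
    static void fill_header(jtlp_header *h, const struct stat *model,
                            uint64_t state_size, uint64_t prompt_size) {
        memcpy(h->magic, "jtlp", 4);
        memset(h->version, 0, 4);
        write64le(h->state_size, state_size);
        write64le(h->model_dev, model->st_dev);
        write64le(h->model_ino, model->st_ino);
        write64le(h->model_mtim_sec, model->st_mtim.tv_sec);
        write64le(h->model_mtim_nsec, model->st_mtim.tv_nsec);
        write64le(h->prompt_size, prompt_size);
    }

    // Mirrors the reload checks: right magic, no future version, a state
    // blob of the size this build expects, the same model inode, and a
    // model that was not modified after the cache was written (seconds
    // only here; the patch also compares nanoseconds via CompareTime).
    static bool cache_is_fresh(const jtlp_header *h, const struct stat *model,
                               uint64_t expected_state_size) {
        return memcmp(h->magic, "jtlp", 4) == 0 &&
               read32le(h->version) == 0 &&
               read64le(h->state_size) == expected_state_size &&
               read64le(h->model_dev) == (uint64_t)model->st_dev &&
               read64le(h->model_ino) == (uint64_t)model->st_ino &&
               (int64_t)read64le(h->model_mtim_sec) >=
                   (int64_t)model->st_mtim.tv_sec;
    }

    int main() {
        struct stat st = {};
        st.st_dev = 1;
        st.st_ino = 42;
        st.st_mtim.tv_sec = 1682700000;
        jtlp_header h;
        fill_header(&h, &st, 12345, 7);
        return cache_is_fresh(&h, &st, 12345) ? 0 : 1;
    }

Note that a stale or foreign cache is never a hard error in the patch:
every failed check jumps to CantReloadPrompt, the prompt is evaluated from
scratch, and a fresh cache is written once loading completes.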
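The save path gets its crash safety from the mkstemp()+rename() idiom: the
cache is assembled in a temporary file in the same directory, and rename(2),
which is atomic within a filesystem, publishes it over .prompt.jtlp only
once it is complete, so an interrupted run can never leave a torn cache
behind. The patch itself sizes the temporary with ftruncate() and fills it
through a shared mmap(); the sketch below shows the same idiom with a plain
write() loop instead, using a hypothetical save_atomically() helper that is
not part of the patch:

    // Sketch of the atomic-save idiom, assuming POSIX mkstemp/write/rename.
    #include <cerrno>
    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include <unistd.h>

    static bool save_atomically(const std::string &path,
                                const void *data, size_t size) {
        std::string tmp = path + ".XXXXXX";  // same mkstemp template as the patch
        int fd = mkstemp(&tmp[0]);           // temp file on the same filesystem
        if (fd == -1) return false;
        const char *p = static_cast<const char *>(data);
        size_t left = size;
        while (left > 0) {                   // write(2) may return short counts
            ssize_t rc = write(fd, p, left);
            if (rc < 0 && errno == EINTR) continue;
            if (rc <= 0) {                   // real error: discard the temp file
                close(fd);
                unlink(tmp.c_str());
                return false;
            }
            p += rc;
            left -= (size_t)rc;
        }
        if (close(fd)) {                     // close() can surface write errors
            unlink(tmp.c_str());
            return false;
        }
        if (rename(tmp.c_str(), path.c_str())) {  // atomic publish
            unlink(tmp.c_str());
            return false;
        }
        return true;
    }

    int main() {
        const char msg[] = "hello";
        return save_atomically("/tmp/demo.bin", msg, sizeof(msg)) ? 0 : 1;
    }

The error paths mirror the patch's CouldNotSavePrompt cleanup: the
temporary is always unlinked on failure, so aborted saves do not accumulate
stray .prompt.jtlp.XXXXXX files, and readers of the real path only ever see
either the old complete cache or the new one.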