From ea10d3ded2994106596ddf8e4ed02741b3e053e6 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 23 Mar 2023 19:54:28 +0200 Subject: [PATCH 01/20] Command line args bounds checking (#424) * command line args bounds checking * unknown and invalid param exit codes 0 -> 1 --- utils.cpp | 101 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/utils.cpp b/utils.cpp index 1d5309c3a..45c9cabb1 100644 --- a/utils.cpp +++ b/utils.cpp @@ -26,41 +26,95 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); } + bool invalid_param = false; + std::string arg; for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; + arg = argv[i]; if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoi(argv[i]); } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); } else if (arg == "-p" || arg == "--prompt") { - params.prompt = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; } else if (arg == "-f" || arg == "--file") { - std::ifstream file(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); if (params.prompt.back() == '\n') { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); } else if (arg == "--top_k") { - params.top_k = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); } else if (arg == "-c" || arg == "--ctx_size") { - params.n_ctx = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); } else if (arg == "--memory_f16") { params.memory_f16 = true; } else if (arg == "--top_p") { - params.top_p = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); } else if (arg == "--temp") { - params.temp = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); } else if (arg == "--repeat_last_n") { - params.repeat_last_n = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); } else if (arg == "--repeat_penalty") { - params.repeat_penalty = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--interactive-first") { @@ -70,13 +124,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--color") { 
params.use_color = true; } else if (arg == "-r" || arg == "--reverse-prompt") { - params.antiprompt.push_back(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); } else if (arg == "--perplexity") { params.perplexity = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--n_parts") { - params.n_parts = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parts = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { gpt_print_usage(argc, argv, params); exit(0); @@ -85,9 +147,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); gpt_print_usage(argc, argv, params); - exit(0); + exit(1); } } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } return true; } From ad072fc5ad6f6905a7224ff6ea07c0644aa075b1 Mon Sep 17 00:00:00 2001 From: nusu-github <29514220+nusu-github@users.noreply.github.com> Date: Fri, 24 Mar 2023 05:16:48 +0900 Subject: [PATCH 02/20] Generate library with CMake (#430) * Generate library with CMake BUILD_SHARED_LIBS to allow llama library to be generated. * Turn ON PIC when BUILD_SHARED_LIBS is ON --- CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d952afb4f..51af97c4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,6 +218,9 @@ add_library(utils OBJECT target_include_directories(utils PUBLIC .) target_compile_features(utils PUBLIC cxx_std_11) # don't bump target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(ggml OBJECT ggml.c @@ -226,6 +229,9 @@ add_library(ggml OBJECT target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(llama llama.cpp @@ -234,6 +240,10 @@ add_library(llama target_include_directories(llama PUBLIC .) 
target_compile_features(llama PUBLIC cxx_std_11) # don't bump target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) +endif() # # Executables From 20a1a4e09c522a80e2a0db51643d25fa38326065 Mon Sep 17 00:00:00 2001 From: Timmy Knight Date: Thu, 23 Mar 2023 10:18:13 -1000 Subject: [PATCH 03/20] Fix GPTQ converter (#423) * Fix GPTQ converter * Fix comment --------- Co-authored-by: Georgi Gerganov --- convert-gptq-to-ggml.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py index 7fccb4d56..6c77808fc 100644 --- a/convert-gptq-to-ggml.py +++ b/convert-gptq-to-ggml.py @@ -36,7 +36,8 @@ fname_out = sys.argv[3] fout = open(fname_out, "wb") -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex +fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", n_vocab)) fout.write(struct.pack("i", n_embd)) fout.write(struct.pack("i", n_mult)) @@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4)) # This loop unchanged from convert-pth-to-ggml.py: for i in range(tokenizer.vocab_size()): if tokenizer.is_unknown(i): - # "" token (translated as ??) text = " \u2047 ".encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) elif tokenizer.is_control(i): - # ""/"" tokens - fout.write(struct.pack("i", 0)) + text = b"" elif tokenizer.is_byte(i): - # "" tokens (which may be invalid UTF-8) piece = tokenizer.id_to_piece(i) if len(piece) != 6: - print("Invalid token: " + piece) + print(f"Invalid token: {piece}") sys.exit(1) byte_value = int(piece[3:-1], 16) - fout.write(struct.pack("i", 1)) - fout.write(struct.pack("B", byte_value)) + text = struct.pack("B", byte_value) else: - # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces. text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", tokenizer.get_score(i))) def write_header(shape, dst_name, ftype_cur): sname = dst_name.encode('utf-8') From 2e17dfd80a473099dacc0f41c9146d233c6a5972 Mon Sep 17 00:00:00 2001 From: rabidcopy Date: Thu, 23 Mar 2023 15:22:47 -0500 Subject: [PATCH 04/20] Replace EOS with newline to prevent context/memory being flushed by EOS in interactive mode (#333) * Improve interactive mode's coherence after EOS Aims to improve coherence and ability to resume the interactive session when the user is given input back after an end of text token is reached. Not sure what token 13 is or why it seems to help. See conversation for examples. 
* Make newline token a constant * dynamically determine newline token * relocate previous newline token const * cleanup whitespace * print a new line on end of text in interactive this may need to be looked into further when not using a reverse prompt * only print manual newline with reverse prompt fix formatting of reverse prompts so they don't end up at the end of the current line while not introducing unnecessary new lines otherwise * alternate approach to replace end of text tokens * Inject the reverse prompt again after eos in interactive mode * tokenize reverse prompt when needed makes this PR compatible with https://github.com/ggerganov/llama.cpp/pull/330 * tokenize and inject only first reverse prompt thanks to tjohnman * tokenize first reverse prompt once * add newline token * add newline token * tokenize/inject reverse prompt for refactor this doesn't seem right though * tokenize nothing for antiprompt if no reverse * Update main.cpp * Update main.cpp * tokenize and inject reverse prompt as needed this doesn't seem to work if the reverse prompt is tokenized outside earlier on * not needed * remove newline token * remove newline token * tokenize newline token * add space to comment * Update main.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Slaren <2141330+slaren@users.noreply.github.com> Co-authored-by: Georgi Gerganov --- main.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/main.cpp b/main.cpp index 431c94b52..5ba6d5a75 100644 --- a/main.cpp +++ b/main.cpp @@ -258,6 +258,9 @@ int main(int argc, char ** argv) { params.interactive = true; } + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); @@ -359,6 +362,16 @@ int main(int argc, char ** argv) { last_n_tokens.push_back(id); } + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos() && params.interactive) { + id = llama_token_newline.front(); + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + // add it to the context embd.push_back(id); @@ -451,12 +464,8 @@ int main(int argc, char ** argv) { // end of text token if (embd.back() == llama_token_eos()) { - if (params.interactive) { - is_interacting = true; - } else { - fprintf(stderr, " [end of text]\n"); - break; - } + fprintf(stderr, " [end of text]\n"); + break; } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. 
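
For reference, here is a minimal standalone sketch of the end-of-text substitution this patch adds to main.cpp's generation loop. It assumes the llama.h C API and the ::llama_tokenize() helper already used in the diff above; the function name and the idea of passing the pre-tokenized newline token in as an argument are illustrative only and not part of the patch.

// Sketch only -- mirrors the logic added by this patch, not a drop-in replacement.
#include "llama.h"   // llama_context, llama_token, llama_token_eos()
#include "utils.h"   // gpt_params and the ::llama_tokenize() helper

#include <vector>

// Decide which token to feed back into the context after sampling `id`.
// In interactive mode an end-of-text token is replaced with a newline token
// (obtained earlier by tokenizing "\n"), and the first reverse prompt, if any,
// is tokenized and appended to `embd_inp` so generation hands control back to
// the user instead of ending the session.
static llama_token replace_eos_in_interactive(
        struct llama_context * ctx,
        llama_token id,
        const gpt_params & params,
        const std::vector<llama_token> & llama_token_newline, // result of ::llama_tokenize(ctx, "\n", false)
        std::vector<llama_token> & embd_inp) {
    if (id == llama_token_eos() && params.interactive) {
        // assumes tokenizing "\n" produced at least one token, as the patch does
        id = llama_token_newline.front();
        if (!params.antiprompt.empty()) {
            // tokenize and inject the first reverse prompt
            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
            embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
        }
    }
    return id;
}

In main.cpp the same logic runs inline right after sampling, and a later patch in this series ("fix instruct mode", #445) additionally guards it with !params.instruct so Alpaca-style instruction sessions keep their original end-of-text behavior.
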
From 0ba5a3a9a5efedb1aeecbbc70a4e9825542472d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:32:02 +0200 Subject: [PATCH 05/20] Obsolete --- download-pth.py | 66 ------------------------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 download-pth.py diff --git a/download-pth.py b/download-pth.py deleted file mode 100644 index 129532c0c..000000000 --- a/download-pth.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import sys -from tqdm import tqdm -import requests - -if len(sys.argv) < 3: - print("Usage: download-pth.py dir-model model-type\n") - print(" model-type: Available models 7B, 13B, 30B or 65B") - sys.exit(1) - -modelsDir = sys.argv[1] -model = sys.argv[2] - -num = { - "7B": 1, - "13B": 2, - "30B": 4, - "65B": 8, -} - -if model not in num: - print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B") - sys.exit(1) - -print(f"Downloading model {model}") - -files = ["checklist.chk", "params.json"] - -for i in range(num[model]): - files.append(f"consolidated.0{i}.pth") - -resolved_path = os.path.abspath(os.path.join(modelsDir, model)) -os.makedirs(resolved_path, exist_ok=True) - -for file in files: - dest_path = os.path.join(resolved_path, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) - -files2 = ["tokenizer_checklist.chk", "tokenizer.model"] -for file in files2: - dest_path = os.path.join(modelsDir, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) \ No newline at end of file From 4cc053b6d5e9df7ac21fa06b7208a70c156d4d7a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:39:44 +0200 Subject: [PATCH 06/20] Remove oboslete command from Docker script --- .devops/tools.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index 352e04942..b0196b60d 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then ./quantize $arg2 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then ./main $arg2 -elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then - python3 ./download-pth.py $arg2 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then - echo "Downloading model..." - python3 ./download-pth.py "$1" "$2" echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do if [ -f "${i/f16/q4_0}" ]; then @@ -39,8 +35,6 @@ else echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" - echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/" - echo " ex: \"/models/\" 7B" - echo " --all-in-one (-a): Execute --download, --convert & --quantize" + echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" fi From 404e1da38ec8025707031a8027da14dc1590f952 Mon Sep 17 00:00:00 2001 From: Jed Fox Date: Thu, 23 Mar 2023 16:42:52 -0400 Subject: [PATCH 07/20] Fix quantize script not finding models in parent directory (#428) --- quantize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/quantize.py b/quantize.py index 6320b0a26..16b5963d3 100644 --- a/quantize.py +++ b/quantize.py @@ -57,6 +57,7 @@ def main(): # ) args = parser.parse_args() + args.models_path = os.path.abspath(args.models_path) if not os.path.isfile(args.quantize_script_path): print( From 483bab2e3d4a868fe679d8bb32827d2a4df214dc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 23:22:01 +0200 Subject: [PATCH 08/20] Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439) Should make results reproducible for different number of threads and batch sizes --- llama.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7de3c19c8..d55219256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -727,11 +727,13 @@ static bool llama_eval_internal( // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V_trans = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3); + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); From 4870e455b3653f7d7769fa5772b2c90ffad088df Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 00:11:53 +0200 Subject: [PATCH 09/20] Fix memory allocation issues and seg faults --- llama.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index d55219256..cf796cce3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,6 +102,9 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; + + // work buffer for transformer evaluation + std::vector buf_eval; }; struct llama_context_params llama_context_default_params() { @@ -627,27 +630,19 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; + auto & buf_eval = lctx.buf_eval; - // TODO: fix this hardcoded size - static size_t buf_size = 512u*1024*1024; - static void * buf = malloc(buf_size); + if (mem_per_token*(n_past + N + 16) > buf_eval.size()) { + const size_t buf_size_new = 1.618*buf_eval.size(); - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - 
const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new); - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } + buf_eval.resize(buf_size_new); } struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, + /*.mem_size =*/ buf_eval.size(), + /*.mem_buffer =*/ buf_eval.data(), }; struct ggml_context * ctx0 = ggml_init(params); @@ -832,10 +827,11 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + if (N == 1) { + mem_per_token = ggml_used_mem(ctx0)/(n_past + N); } - //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); + + //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024); ggml_free(ctx0); @@ -1416,6 +1412,8 @@ struct llama_context * llama_init_from_file( return nullptr; } + ctx->buf_eval.resize(512u*1024u*1024u); + return ctx; } From 3cd8dde0d1357b7f11bdd25c45d5bf5e97e284a0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 06:22:28 +0200 Subject: [PATCH 10/20] Revert "Fix memory allocation issues and seg faults" This reverts commit 4870e455b3653f7d7769fa5772b2c90ffad088df. Will provide the correct fix later --- llama.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/llama.cpp b/llama.cpp index cf796cce3..d55219256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,9 +102,6 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; - - // work buffer for transformer evaluation - std::vector buf_eval; }; struct llama_context_params llama_context_default_params() { @@ -630,19 +627,27 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; - auto & buf_eval = lctx.buf_eval; - if (mem_per_token*(n_past + N + 16) > buf_eval.size()) { - const size_t buf_size_new = 1.618*buf_eval.size(); + // TODO: fix this hardcoded size + static size_t buf_size = 512u*1024*1024; + static void * buf = malloc(buf_size); - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new); + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - buf_eval.resize(buf_size_new); + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } } struct ggml_init_params params = { - /*.mem_size =*/ buf_eval.size(), - /*.mem_buffer =*/ buf_eval.data(), + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, }; struct ggml_context * ctx0 = ggml_init(params); @@ -827,11 +832,10 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), 
sizeof(float)*n_vocab); } - if (N == 1) { - mem_per_token = ggml_used_mem(ctx0)/(n_past + N); + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; } - - //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024); + //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); @@ -1412,8 +1416,6 @@ struct llama_context * llama_init_from_file( return nullptr; } - ctx->buf_eval.resize(512u*1024u*1024u); - return ctx; } From b6b268d4415fd3b3e53f22b6619b724d4928f713 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 09:13:35 +0200 Subject: [PATCH 11/20] Add link to Roadmap discussion --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee8dc1dcb..06799b5b3 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457) - New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370 -- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105 From 8d4a855c241ecb0f3ddc03447fe56002ebf27a37 Mon Sep 17 00:00:00 2001 From: Luciano Date: Fri, 24 Mar 2023 08:05:13 -0700 Subject: [PATCH 12/20] Add embedding mode with arg flag. Currently working (#282) * working but ugly * add arg flag, not working on embedding mode * typo * Working! Thanks to @nullhook * make params argument instead of hardcoded boolean. remove useless time check * start doing the instructions but not finished. 
This probably doesnt compile * Embeddings extraction support --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 56 +++++++++++++++++++++++++++++++++++++++++++++---------- llama.h | 5 +++++ main.cpp | 23 +++++++++++++++++++++++ utils.cpp | 4 ++++ utils.h | 4 ++++ 5 files changed, 82 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index d55219256..d8c771529 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,6 +102,9 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; + + // input embedding (1-dimensional array: [n_embd]) + std::vector embedding; }; struct llama_context_params llama_context_default_params() { @@ -112,6 +115,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.embedding =*/ false, }; return result; @@ -592,8 +596,6 @@ static bool llama_model_load( fin.close(); } - lctx.logits.reserve(lctx.model.hparams.n_ctx); - lctx.t_load_us = ggml_time_us() - t_start_us; return true; @@ -791,6 +793,9 @@ static bool llama_eval_internal( inpL = cur; } + // used at the end to optionally extract the embeddings + struct ggml_tensor * embeddings = NULL; + // norm { inpL = ggml_rms_norm(ctx0, inpL); @@ -799,6 +804,8 @@ static bool llama_eval_internal( inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm, inpL), inpL); + + embeddings = inpL; } // lm_head @@ -821,15 +828,26 @@ static bool llama_eval_internal( //embd_w.resize(n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - auto & logits_out = lctx.logits; + // extract logits + { + auto & logits_out = lctx.logits; - if (lctx.logits_all) { - logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N); - } else { - // return result for just the last token - logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + if (lctx.logits_all) { + logits_out.resize(n_vocab * N); + memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N); + } else { + // return result for just the last token + logits_out.resize(n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + } + } + + // extract embeddings + if (lctx.embedding.size()) { + auto & embedding_out = lctx.embedding; + + embedding_out.resize(n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } if (mem_per_token == 0) { @@ -1416,6 +1434,20 @@ struct llama_context * llama_init_from_file( return nullptr; } + // reserve memory for context buffers + { + const auto & hparams = ctx->model.hparams; + if (params.logits_all) { + ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + } else { + ctx->logits.reserve(hparams.n_ctx); + } + + if (params.embedding){ + ctx->embedding.reserve(hparams.n_embd); + } + } + return ctx; } @@ -1484,6 +1516,10 @@ float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } +float * llama_get_embeddings(struct llama_context * ctx) { + return ctx->embedding.data(); +} + const char * llama_token_to_str(struct llama_context * ctx, llama_token token) { if (token >= llama_n_vocab(ctx)) { return nullptr; diff --git a/llama.h b/llama.h index 3df9ed1fd..209b4dbe8 100644 --- a/llama.h +++ b/llama.h @@ -53,6 +53,7 @@ extern "C" { bool f16_kv; // use fp16 for KV cache bool logits_all; 
// the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights + bool embedding; // embedding mode only }; LLAMA_API struct llama_context_params llama_context_default_params(); @@ -108,6 +109,10 @@ extern "C" { // Cols: n_vocab LLAMA_API float * llama_get_logits(struct llama_context * ctx); + // Get the embeddings for the input + // shape: [n_embd] (1-dimensional) + LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); + // Token Id -> String. Uses the vocabulary in the provided context LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token); diff --git a/main.cpp b/main.cpp index 5ba6d5a75..46a80ff87 100644 --- a/main.cpp +++ b/main.cpp @@ -199,6 +199,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.embedding = params.embedding; ctx = llama_init_from_file(params.model.c_str(), lparams); @@ -292,6 +293,7 @@ int main(int argc, char ** argv) { std::vector embd; + int last_n_size = params.repeat_last_n; std::vector last_n_tokens(last_n_size); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); @@ -324,6 +326,27 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly set_console_state(CONSOLE_STATE_PROMPT); + if (params.embedding){ + embd = embd_inp; + + if (embd.size() > 0) { + if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + const auto embeddings = llama_get_embeddings(ctx); + + // TODO: print / use the embeddings + + if (params.use_color) { + printf(ANSI_COLOR_RESET); + } + + return 0; + } + while (remaining_tokens > 0 || params.interactive) { // predict if (embd.size() > 0) { diff --git a/utils.cpp b/utils.cpp index 45c9cabb1..0df89af0b 100644 --- a/utils.cpp +++ b/utils.cpp @@ -117,6 +117,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.model = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; + } else if (arg == "--embedding") { + params.embedding = true; + } else if (arg == "--interactive-start") { + params.interactive = true; } else if (arg == "--interactive-first") { params.interactive_start = true; } else if (arg == "-ins" || arg == "--instruct") { diff --git a/utils.h b/utils.h index b0de556c9..8120c123b 100644 --- a/utils.h +++ b/utils.h @@ -32,13 +32,17 @@ struct gpt_params { std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string prompt = ""; + std::vector antiprompt; // string upon seeing which more user input is prompted bool memory_f16 = false; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + + bool embedding = false; // get only sentence embedding bool interactive_start = false; // wait for user input immediately + bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt From 563cdc391dde140f1084d1012234e8e6f57f881f Mon Sep 17 00:00:00 2001 From: comex Date: Fri, 24 Mar 2023 08:19:05 -0700 Subject: [PATCH 13/20] Support calling mlock() on loaded model data on Linux and macOS (#453) * 
Support calling mlock() on loaded model data on Linux and macOS This is enabled by a new --mlock command line option. Using mlock() disables swapping and memory compression for the model data. Doing so can be useful on systems where the model takes up a large fraction of system RAM. In my experience, macOS is quite eager to start compressing llama.cpp's memory, which then makes it halt for a few seconds while it decompresses, even with a model that uses "only" 25GB out of 32GB. Of course, this comes at the cost of forcing the system to swap or compress other processes' memory instead, so it needs to be used with care and shouldn't be enabled by default. In theory it should be possible to support this on Windows as well using VirtualLock(), but I'm not much of a Windows user. * Update llama.cpp --------- Co-authored-by: Georgi Gerganov --- ggml.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++-------- ggml.h | 3 +++ llama.cpp | 14 +++++++++- llama.h | 1 + main.cpp | 1 + utils.cpp | 7 +++++ utils.h | 1 + 7 files changed, 91 insertions(+), 12 deletions(-) diff --git a/ggml.c b/ggml.c index 0e4b1466c..800390a88 100644 --- a/ggml.c +++ b/ggml.c @@ -1,5 +1,5 @@ -// Defines CLOCK_MONOTONIC on Linux -#define _POSIX_C_SOURCE 199309L +// Defines CLOCK_MONOTONIC and asprintf on Linux +#define _GNU_SOURCE #include "ggml.h" @@ -10,6 +10,7 @@ #endif #include +#include #include #include #include @@ -31,7 +32,6 @@ #else // ref: https://github.com/ggerganov/whisper.cpp/issues/168 #include -#include #endif typedef volatile LONG atomic_int; @@ -83,6 +83,17 @@ typedef void* thread_ret_t; #define static_assert(cond, msg) _Static_assert(cond, msg) #endif +#define GGML_MLOCK_SUPPORT 0 + +#ifdef __has_include + #if __has_include() + #undef GGML_MLOCK_SUPPORT + #define GGML_MLOCK_SUPPORT 1 + #include + #endif +#endif + + /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -2344,6 +2355,7 @@ struct ggml_context { size_t mem_size; void * mem_buffer; bool mem_buffer_owned; + bool mem_buffer_mlocked; int n_objects; @@ -2619,16 +2631,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } *ctx = (struct ggml_context) { - /*.mem_size =*/ params.mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), - /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, - /*.n_objects =*/ 0, - /*.objects_begin =*/ NULL, - /*.objects_end =*/ NULL, - /*.scratch =*/ { 0, 0, NULL, }, - /*.scratch_save =*/ { 0, 0, NULL, }, + /*.mem_size =*/ params.mem_size, + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), + /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, + /*.mem_buffer_mlocked =*/ false, + /*.n_objects =*/ 0, + /*.objects_begin =*/ NULL, + /*.objects_end =*/ NULL, + /*.scratch =*/ { 0, 0, NULL, }, + /*.scratch_save =*/ { 0, 0, NULL, }, }; + GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure + ggml_assert_aligned(ctx->mem_buffer); GGML_PRINT_DEBUG("%s: context initialized\n", __func__); @@ -2651,6 +2666,14 @@ void ggml_free(struct ggml_context * ctx) { GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. 
memory used = %zu\n", __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); +#if GGML_MLOCK_SUPPORT + if (ctx->mem_buffer_mlocked) { + if (munlock(ctx->mem_buffer, ctx->mem_size)) { + fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno)); + } + } +#endif + if (ctx->mem_buffer_owned) { free(ctx->mem_buffer); } @@ -2679,6 +2702,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } +bool ggml_mlock_supported(void) { + return GGML_MLOCK_SUPPORT; +} + +#if GGML_MLOCK_SUPPORT +#ifdef __APPLE__ + #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)." +#else + #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)." +#endif +bool ggml_mlock(struct ggml_context * ctx, char ** err_p) { + if (ctx->mem_buffer_mlocked) { + return true; + } + if (mlock(ctx->mem_buffer, ctx->mem_size)) { + int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION, + ctx->mem_size, strerror(errno)); + GGML_ASSERT(ret >= 0); + return false; + } + ctx->mem_buffer_mlocked = true; + return true; +} +#else // GGML_MLOCK_SUPPORT +bool ggml_mlock(struct ggml_context * ctx, char ** err_p) { + *err_p = strdup("can't mlock because it's not supported on this system"); + return false; +} +#endif // GGML_MLOCK_SUPPORT + //////////////////////////////////////////////////////////////////////////////// struct ggml_tensor * ggml_new_tensor_impl( diff --git a/ggml.h b/ggml.h index c7e6814a8..ddb97318b 100644 --- a/ggml.h +++ b/ggml.h @@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx); size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); +bool ggml_mlock_supported(void); +bool ggml_mlock(struct ggml_context * ctx, char ** err_p); + struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, diff --git a/llama.cpp b/llama.cpp index d8c771529..5d56cc90e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -115,6 +115,7 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.use_mlock =*/ false, /*.embedding =*/ false, }; @@ -1428,11 +1429,22 @@ struct llama_context * llama_init_from_file( ggml_type type_memory = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) { + if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, + params.vocab_only)) { fprintf(stderr, "%s: failed to load model\n", __func__); delete ctx; return nullptr; } + + if (params.use_mlock) { + char *err; + if (!ggml_mlock(ctx->model.ctx, &err)) { + fprintf(stderr, "%s\n", err); + free(err); + delete ctx; + return nullptr; + } + } // reserve memory for context buffers { diff --git a/llama.h b/llama.h index 209b4dbe8..9943d96ba 100644 --- a/llama.h +++ b/llama.h @@ -53,6 +53,7 @@ extern "C" { bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights + bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only }; diff --git a/main.cpp b/main.cpp index 46a80ff87..39dfc575b 100644 --- a/main.cpp +++ b/main.cpp @@ -199,6 +199,7 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; lparams.embedding = params.embedding; ctx = llama_init_from_file(params.model.c_str(), lparams); diff --git a/utils.cpp b/utils.cpp index 0df89af0b..10673fb82 100644 --- a/utils.cpp +++ b/utils.cpp @@ -1,3 +1,5 @@ +#include "ggml.h" + #include "utils.h" #include @@ -127,6 +129,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.instruct = true; } else if (arg == "--color") { params.use_color = true; + } else if (arg == "--mlock") { + params.use_mlock = true; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { invalid_param = true; @@ -194,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + if (ggml_mlock_supported()) { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); diff --git a/utils.h b/utils.h index 8120c123b..cf914990c 100644 --- a/utils.h +++ b/utils.h @@ -46,6 +46,7 @@ struct gpt_params { bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt + bool use_mlock = false; // use mlock to keep model in memory }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); From 481044d50cfe8eaa6cd0c1a1b445680e4b0b3ebc Mon Sep 17 00:00:00 2001 From: Cameron Kaiser Date: Fri, 24 Mar 2023 08:19:26 -0700 Subject: [PATCH 14/20] additional optimizations for POWER9 (#454) --- Makefile | 3 +- ggml.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 91eebaebd..e8b128cb8 100644 --- a/Makefile +++ b/Makefile @@ -156,7 +156,8 @@ endif ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) - CFLAGS += -mpower9-vector + 
CFLAGS += -mcpu=power9 + CXXFLAGS += -mcpu=power9 endif # Require c++23's std::byteswap for big-endian support. ifeq ($(UNAME_M),ppc64) diff --git a/ggml.c b/ggml.c index 800390a88..92b857a00 100644 --- a/ggml.c +++ b/ggml.c @@ -175,6 +175,39 @@ typedef double ggml_float; #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + #else // FP16 <-> FP32 @@ -272,6 +305,7 @@ static float table_f32_f16[1 << 16]; // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { @@ -462,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); -#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) +#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; @@ -472,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { uint8_t pp[QK/2]; #endif -#if __ARM_NEON +#if defined(__POWER9_VECTOR__) +#if QK == 32 + const vector float v85 = vec_splats(8.5f); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + + for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]); + + for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]); + //for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]); + amaxv[0] = vec_max(amaxv[0], amaxv[2]); + amaxv[4] = vec_max(amaxv[4], amaxv[6]); + //for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]); + amaxv[0] = vec_max(amaxv[0], amaxv[4]); + + amax = MAX( + MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 3) - 1); + const float id = d ? 
1.0/d : 0.0; + + *(float *)pd = d; + pd += bs; + + const vector float vid = vec_splats(id); + for (int l = 0; l < 8; l++) { + const vector float vf = vec_madd(srcv[l], vid, v85); + const vector signed int vi = vec_signed(vf); + + pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4); + pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4); + } + + //memcpy(pb, pp, sizeof(pp)); + pb += bs; + } +#else +#error "not implemented for QK" +#endif +#elif __ARM_NEON #if QK == 32 for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max From afd220d9c665e4c19107120ace2f0cb742e28aa1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 17:21:01 +0200 Subject: [PATCH 15/20] Properly free llama_context on failure --- llama.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5d56cc90e..cdb862828 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1432,16 +1432,16 @@ struct llama_context * llama_init_from_file( if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) { fprintf(stderr, "%s: failed to load model\n", __func__); - delete ctx; + llama_free(ctx); return nullptr; } - + if (params.use_mlock) { char *err; if (!ggml_mlock(ctx->model.ctx, &err)) { fprintf(stderr, "%s\n", err); free(err); - delete ctx; + llama_free(ctx); return nullptr; } } @@ -1464,7 +1464,9 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { - ggml_free(ctx->model.ctx); + if (ctx->model.ctx) { + ggml_free(ctx->model.ctx); + } delete ctx; } From 863f65e2e32dc1e6d23c96a4811bf382d6b2a548 Mon Sep 17 00:00:00 2001 From: rabidcopy Date: Fri, 24 Mar 2023 10:22:39 -0500 Subject: [PATCH 16/20] fix instruct mode (#445) changes to EOS behavior in interactive and reverse prompt handling broke instruct mode by erroneously injecting instruct mode's reverse prompt and an extra newline. --- main.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/main.cpp b/main.cpp index 39dfc575b..44437750e 100644 --- a/main.cpp +++ b/main.cpp @@ -387,7 +387,7 @@ int main(int argc, char ** argv) { } // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive) { + if (id == llama_token_eos() && params.interactive && !params.instruct) { id = llama_token_newline.front(); if (params.antiprompt.size() != 0) { // tokenize and inject first reverse prompt @@ -488,8 +488,12 @@ int main(int argc, char ** argv) { // end of text token if (embd.back() == llama_token_eos()) { - fprintf(stderr, " [end of text]\n"); - break; + if (params.instruct) { + is_interacting = true; + } else { + fprintf(stderr, " [end of text]\n"); + break; + } } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. From f4f5362edb01b05c383b23f36d7b3489c77061b5 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 24 Mar 2023 15:23:09 +0000 Subject: [PATCH 17/20] Update README.md (#444) Added explicit **bolded** instructions clarifying that people need to request access to models from Facebook and never through through this repo. --- README.md | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 06799b5b3..0830074bf 100644 --- a/README.md +++ b/README.md @@ -219,9 +219,11 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. 
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data -* The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [pull request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data. -* Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. -* The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: +- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.** +- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. +- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. +- Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. +- The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: `sha256sum --ignore-missing -c SHA256SUMS` on Linux @@ -229,15 +231,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS -* If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: - * LLaMA: - * [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - * [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) - * GPT-3 - * [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - * GPT-3.5 / InstructGPT / ChatGPT: - * [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - * [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) +- If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. 
This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: + - LLaMA: + - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) + - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) + - GPT-3 + - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) + - GPT-3.5 / InstructGPT / ChatGPT: + - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) + - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) ### Perplexity (Measuring model quality) From 31572d966531f7d768eb773322016ab78eb6e835 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 18:23:56 +0200 Subject: [PATCH 18/20] Temporary bump the memory buffer size - hopefully fix issues from 483bab2e --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cdb862828..9a93409cc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -632,7 +632,7 @@ static bool llama_eval_internal( auto & mem_per_token = lctx.mem_per_token; // TODO: fix this hardcoded size - static size_t buf_size = 512u*1024*1024; + static size_t buf_size = 2048u*1024*1024; // TMP !!! static void * buf = malloc(buf_size); if (mem_per_token > 0 && mem_per_token*N > buf_size) { From 7a9b6c3a8bdc1cb75fefc826dfaa7331eb63695d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 23:17:37 +0200 Subject: [PATCH 19/20] Reduce memory usage and allocate enough memory for largest context (#473) * Reduce memory usage and allocate enough memory for large contexts * Simpler scratch buffer usage * Reenable BLAS for quantized mul_mat * Fix number of layers in 30B and 65B * Fix KV cache size for F32 --- ggml.c | 12 +- llama.cpp | 326 ++++++++++++++++++++++++++++++++++++++++++++---------- main.cpp | 23 +++- utils.cpp | 10 +- utils.h | 16 +-- 5 files changed, 307 insertions(+), 80 deletions(-) diff --git a/ggml.c b/ggml.c index 92b857a00..cfdf427df 100644 --- a/ggml.c +++ b/ggml.c @@ -5846,7 +5846,8 @@ static bool ggml_compute_forward_mul_mat_use_blas( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - UNUSED(src0); + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; const int ne10 = src1->ne[0]; @@ -5856,7 +5857,14 @@ static bool ggml_compute_forward_mul_mat_use_blas( // TODO: find the optimal values for these if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { - //printf("BLAS: %d %d %d\n", ne0, ne1, ne10); + + //// disable BLAS for Q4_0 and Q4_1 + //// looks like there is no benefit and we only waste a lot of memory + //if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) { + // return false; + //} + + //printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01); return true; } diff --git a/llama.cpp b/llama.cpp index 9a93409cc..9d48ccd4c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5,12 +5,25 @@ #include #include #include +#include #include #include #include #include #include +#define LLAMA_USE_SCRATCH +#define LLAMA_MAX_SCRATCH_BUFFERS 16 + +#define LLAMA_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + + // determine number of model parts based on the 
dimension static const std::unordered_map LLAMA_N_PARTS = { { 4096, 1 }, @@ -19,6 +32,52 @@ static const std::unordered_map LLAMA_N_PARTS = { { 8192, 8 }, }; +// available llama models +enum e_model { + MODEL_UNKNOWN, + MODEL_7B, + MODEL_13B, + MODEL_30B, + MODEL_65B, +}; + +static const size_t MB = 1024*1024; + +// computed for n_ctx == 2048 +// TODO: dynamically determine these sizes +// needs modifications in ggml + +static const std::map MEM_REQ_SCRATCH0 = { + { MODEL_7B, 512ull*MB }, + { MODEL_13B, 512ull*MB }, + { MODEL_30B, 512ull*MB }, + { MODEL_65B, 512ull*MB }, +}; + +static const std::map MEM_REQ_SCRATCH1 = { + { MODEL_7B, 512ull*MB }, + { MODEL_13B, 512ull*MB }, + { MODEL_30B, 512ull*MB }, + { MODEL_65B, 512ull*MB }, +}; + +// 2*n_embd*n_ctx*n_layer*sizeof(float16) +static const std::map MEM_REQ_KV_SELF = { + { MODEL_7B, 1026ull*MB }, + { MODEL_13B, 1608ull*MB }, + { MODEL_30B, 3124ull*MB }, + { MODEL_65B, 5120ull*MB }, +}; + +// this is mostly needed for temporary mul_mat buffers to dequantize the data +// not actually needed if BLAS is disabled +static const std::map MEM_REQ_EVAL = { + { MODEL_7B, 768ull*MB }, + { MODEL_13B, 1024ull*MB }, + { MODEL_30B, 1280ull*MB }, + { MODEL_65B, 1536ull*MB }, +}; + // default hparams (LLaMA 7B) struct llama_hparams { int32_t n_vocab = 32000; @@ -50,7 +109,20 @@ struct llama_layer { struct ggml_tensor * w3; }; +struct llama_kv_cache { + struct ggml_tensor * k; + struct ggml_tensor * v; + + struct ggml_context * ctx; + + std::vector buf; + + int n; // number of tokens currently in the cache +}; + struct llama_model { + e_model type = MODEL_UNKNOWN; + llama_hparams hparams; struct ggml_tensor * tok_embeddings; @@ -60,12 +132,18 @@ struct llama_model { std::vector layers; - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // + // context struct ggml_context * ctx; + + // key + value cache for the self attention + // TODO: move to llama_state + struct llama_kv_cache kv_self; + + // the model memory buffer + std::vector buf; + + // tensors + int n_loaded; std::unordered_map tensors; }; @@ -105,8 +183,88 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; + + // memory buffers used to evaluate the model + // TODO: move in llama_state + std::vector buf_compute; + std::vector buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + + int buf_last = 0; + size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; + + void use_buf(struct ggml_context * ctx, int i) { +#if defined(LLAMA_USE_SCRATCH) + size_t last_size = 0; + + if (i == -1) { + last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); + } else { + auto & buf = buf_scratch[i]; + last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), }); + } + + if (buf_last >= 0) { + buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); + } + + buf_last = i; +#else + (void) i; + (void) ctx; +#endif + } + + size_t get_buf_max_mem(int i) const { +#if defined(LLAMA_USE_SCRATCH) + return buf_max_size[i]; +#else + (void) i; + return 0; +#endif + } }; +// +// kv cache +// + +static bool kv_cache_init( + const struct llama_hparams & hparams, + struct llama_kv_cache & cache, + ggml_type wtype, + int n_ctx) { + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + cache.buf.resize(2*n_elements*ggml_type_size(wtype) + 2u*MB); + + struct ggml_init_params params; + params.mem_size = cache.buf.size(); + params.mem_buffer = 
cache.buf.data(); + + cache.ctx = ggml_init(params); + + if (!cache.ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + + return true; +} + +static void kv_cache_free(struct llama_kv_cache & cache) { + if (cache.ctx) { + ggml_free(cache.ctx); + cache.ctx = nullptr; + } +} + struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, @@ -204,6 +362,22 @@ static bool llama_model_load( fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__); } + if (hparams.n_layer == 32) { + model.type = e_model::MODEL_7B; + } + + if (hparams.n_layer == 40) { + model.type = e_model::MODEL_13B; + } + + if (hparams.n_layer == 60) { + model.type = e_model::MODEL_30B; + } + + if (hparams.n_layer == 80) { + model.type = e_model::MODEL_65B; + } + fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); @@ -214,6 +388,7 @@ static bool llama_model_load( fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); + fprintf(stderr, "%s: type = %d\n", __func__, model.type); } // load vocab @@ -307,11 +482,32 @@ static bool llama_model_load( fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } + // print memory requirements + { + const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; + + // this is the total memory required to run the inference + const size_t mem_required = + ctx_size + + MEM_REQ_SCRATCH0.at(model.type) + + MEM_REQ_SCRATCH1.at(model.type) + + MEM_REQ_EVAL.at (model.type); + + // this is the memory required by one llama_state + const size_t mem_required_state = + scale*MEM_REQ_KV_SELF.at(model.type); + + fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + } + // create the ggml context { + lctx.model.buf.resize(ctx_size); + struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, + /*.mem_size =*/ lctx.model.buf.size(), + /*.mem_buffer =*/ lctx.model.buf.data(), }; model.ctx = ggml_init(params); @@ -374,25 +570,6 @@ static bool llama_model_load( } } - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - const size_t file_offset = fin.tellg(); fin.close(); @@ -416,9 +593,10 @@ static bool llama_model_load( // load weights { - int n_tensors = 0; size_t total_size = 0; + model.n_loaded = 0; + fprintf(stderr, "%s: ", __func__); while (true) { @@ -583,7 +761,10 @@ static bool llama_model_load( } //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - if (++n_tensors % 8 == 0) { + model.n_loaded++; + + // progress + if (model.n_loaded % 8 == 0) { fprintf(stderr, "."); fflush(stderr); } @@ -591,7 +772,13 @@ static bool llama_model_load( fprintf(stderr, " done\n"); - fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded); + if (model.n_loaded == 0) { + fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); + } else if (model.n_loaded != (int) model.tensors.size()) { + fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); + return false; + } } fin.close(); @@ -622,6 +809,10 @@ static bool llama_eval_internal( const auto & model = lctx.model; const auto & hparams = model.hparams; + auto & kv_self = model.kv_self; + + LLAMA_ASSERT(!!kv_self.ctx); + const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; @@ -630,27 +821,11 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; - - // TODO: fix this hardcoded size - static size_t buf_size = 2048u*1024*1024; // TMP !!! - static void * buf = malloc(buf_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, + /*.mem_size =*/ buf_compute.size(), + /*.mem_buffer =*/ buf_compute.data(), }; struct ggml_context * ctx0 = ggml_init(params); @@ -667,6 +842,8 @@ static bool llama_eval_internal( struct ggml_tensor * cur; + lctx.use_buf(ctx0, 0); + // norm { cur = ggml_rms_norm(ctx0, inpL); @@ -685,8 +862,8 @@ static bool llama_eval_internal( // store key and value to memory if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); @@ -707,7 +884,7 @@ static bool llama_eval_internal( ggml_permute(ctx0, ggml_rope(ctx0, ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), n_past, n_rot, 1), 0, 2, 1, 3); @@ -733,7 +910,7 @@ static bool llama_eval_internal( ggml_cpy(ctx0, 
ggml_permute(ctx0, ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), n_embd/n_head, n_head, n_past + N), 1, 2, 0, 3), ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); @@ -755,6 +932,8 @@ static bool llama_eval_internal( cur); } + lctx.use_buf(ctx0, 1); + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); // feed-forward network @@ -773,7 +952,6 @@ static bool llama_eval_internal( model.layers[il].w3, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); @@ -788,17 +966,20 @@ static bool llama_eval_internal( cur); } - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_add(ctx0, cur, inpFF); // input for next layer inpL = cur; } + lctx.use_buf(ctx0, 0); + // used at the end to optionally extract the embeddings struct ggml_tensor * embeddings = NULL; // norm { + inpL = ggml_rms_norm(ctx0, inpL); // inpL = norm*inpL @@ -810,9 +991,9 @@ static bool llama_eval_internal( } // lm_head - { - inpL = ggml_mul_mat(ctx0, model.output, inpL); - } + inpL = ggml_mul_mat(ctx0, model.output, inpL); + + lctx.use_buf(ctx0, -1); // logits -> probs //inpL = ggml_soft_max(ctx0, inpL); @@ -854,7 +1035,13 @@ static bool llama_eval_internal( if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } - //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); + +#if 0 + printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__, + ggml_used_mem(ctx0)/1024.0/1024.0, + lctx.get_buf_max_mem(0)/1024.0/1024.0, + lctx.get_buf_max_mem(1)/1024.0/1024.0); +#endif ggml_free(ctx0); @@ -1427,9 +1614,9 @@ struct llama_context * llama_init_from_file( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, + if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type, params.vocab_only)) { fprintf(stderr, "%s: failed to load model\n", __func__); llama_free(ctx); @@ -1448,6 +1635,17 @@ struct llama_context * llama_init_from_file( // reserve memory for context buffers { + if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { + fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); + llama_free(ctx); + return nullptr; + } + + { + const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); + fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + } + const auto & hparams = ctx->model.hparams; if (params.logits_all) { ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); @@ -1458,12 +1656,19 @@ struct llama_context * llama_init_from_file( if (params.embedding){ ctx->embedding.reserve(hparams.n_embd); } + + ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type)); + + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type)); + ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type)); } return ctx; } void llama_free(struct llama_context * ctx) { + kv_cache_free(ctx->model.kv_self); + if (ctx->model.ctx) { ggml_free(ctx->model.ctx); } @@ -1619,4 +1824,3 @@ const char * llama_print_system_info(void) { return s.c_str(); } - diff --git a/main.cpp b/main.cpp index 44437750e..bc71a5494 100644 --- a/main.cpp +++ b/main.cpp @@ -217,11 +217,23 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // determine the required inference memory per token: - // TODO: better way to do that - { - const std::vector tmp = { 0, 1, 2, 3 }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + const std::vector tmp(params.n_batch, 0); + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + } + + { + const std::vector tmp = { 0, }; + llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; } if (params.perplexity) { @@ -508,7 +520,6 @@ int main(int argc, char ** argv) { #endif llama_print_timings(ctx); - llama_free(ctx); set_console_state(CONSOLE_STATE_DEFAULT); diff --git a/utils.cpp b/utils.cpp index 10673fb82..2f995c12d 100644 --- a/utils.cpp +++ b/utils.cpp @@ -79,8 +79,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--memory_f16") { - params.memory_f16 = true; + } else if (arg == "--memory_f32") { + params.memory_f16 = false; } else if (arg == "--top_p") { if (++i >= argc) { invalid_param = true; @@ -111,6 +111,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -131,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_color = true; } else if (arg == "--mlock") { params.use_mlock = true; + } else if (arg 
== "--mtest") { + params.mem_test = true; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { invalid_param = true; @@ -193,7 +196,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty); fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n"); - fprintf(stderr, " --memory_f16 use f16 instead of f32 for memory key+value\n"); + fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -201,6 +204,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { if (ggml_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } + fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); diff --git a/utils.h b/utils.h index cf914990c..d469bc6a0 100644 --- a/utils.h +++ b/utils.h @@ -14,12 +14,13 @@ // struct gpt_params { - int32_t seed = -1; // RNG seed + int32_t seed = -1; // RNG seed int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize - int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) - int32_t n_ctx = 512; //context size + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) + int32_t n_ctx = 512; // context size + int32_t n_batch = 8; // batch size for prompt processing // sampling parameters int32_t top_k = 40; @@ -27,15 +28,13 @@ struct gpt_params { float temp = 0.80f; float repeat_penalty = 1.10f; - int32_t n_batch = 8; // batch size for prompt processing - std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string prompt = ""; std::vector antiprompt; // string upon seeing which more user input is prompted - bool memory_f16 = false; // use f16 instead of f32 for memory kv + bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode @@ -47,6 +46,7 @@ struct gpt_params { bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt bool use_mlock = false; // use mlock to keep model in memory + bool mem_test = false; // compute maximum memory usage }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); From 04c6f5ed6fafd63601fa06757877ed5ccf9d5991 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 23:17:58 +0200 Subject: [PATCH 20/20] Immediately start processing the prompt before user input has been provided (#476) --- alpaca.sh | 2 +- chat.sh | 2 +- 
examples/chatLLaMa | 2 +- main.cpp | 14 ++++++++++---- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/alpaca.sh b/alpaca.sh index 2f36d6f54..d8a9f456a 100755 --- a/alpaca.sh +++ b/alpaca.sh @@ -3,4 +3,4 @@ # Temporary script - will be removed in the future # -./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 +./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 diff --git a/chat.sh b/chat.sh index 24a0f10ad..5531315b3 100755 --- a/chat.sh +++ b/chat.sh @@ -3,4 +3,4 @@ # Temporary script - will be removed in the future # -./main -m ./models/7B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/chatLLaMa b/examples/chatLLaMa index 97ababbc5..4265d7b66 100755 --- a/examples/chatLLaMa +++ b/examples/chatLLaMa @@ -13,7 +13,7 @@ N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: # For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 -GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --repeat_penalty 1.17647}" +GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS ./main $GEN_OPTIONS \ diff --git a/main.cpp b/main.cpp index bc71a5494..3f49ad997 100644 --- a/main.cpp +++ b/main.cpp @@ -372,7 +372,7 @@ int main(int argc, char ** argv) { n_past += embd.size(); embd.clear(); - if ((int) embd_inp.size() <= input_consumed) { + if ((int) embd_inp.size() <= input_consumed && !is_interacting) { // out of user input, sample next token const float top_k = params.top_k; const float top_p = params.top_p; @@ -451,13 +451,16 @@ int main(int argc, char ** argv) { } // Check if each of the reverse prompts appears at the end of the output. - for (std::string antiprompt : params.antiprompt) { + for (std::string & antiprompt : params.antiprompt) { if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { is_interacting = true; + set_console_state(CONSOLE_STATE_USER_INPUT); + fflush(stdout); break; } } - if (is_interacting) { + + if (n_past > 0 && is_interacting) { // potentially set color to indicate we are taking user input set_console_state(CONSOLE_STATE_USER_INPUT); @@ -495,7 +498,10 @@ int main(int argc, char ** argv) { input_noecho = true; // do not echo this again } - is_interacting = false; + + if (n_past > 0) { + is_interacting = false; + } } // end of text token
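
The MEM_REQ_KV_SELF table introduced above follows the 2*n_embd*n_ctx*n_layer*sizeof(float16) rule noted in its comment. A standalone sketch of that arithmetic, assuming the usual LLaMA dimensions for each model size (the n_embd/n_layer pairs are not part of this patch and are hard-coded here purely for illustration):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // assumed LLaMA dimensions: { n_embd, n_layer } for 7B, 13B, 30B, 65B
        const int64_t dims[4][2] = { {4096, 32}, {5120, 40}, {6656, 60}, {8192, 80} };
        const char *  names[4]   = { "7B", "13B", "30B", "65B" };

        const int64_t n_ctx = 2048; // the MEM_REQ_* tables are computed for n_ctx == 2048

        for (int i = 0; i < 4; i++) {
            const int64_t n_embd  = dims[i][0];
            const int64_t n_layer = dims[i][1];

            // one f16 key and one f16 value per layer per context position
            const int64_t bytes = 2 /* k + v */ * n_embd * n_layer * n_ctx * 2 /* bytes per f16 */;

            printf("%-3s kv self ~= %5.0f MB\n", names[i], bytes / 1024.0 / 1024.0);
        }
        return 0;
    }

This prints 1024, 1600, 3120 and 5120 MB, which lines up with the table entries to within a few MB; kv_cache_init() additionally pads its buffer with 2*MB for the ggml context and tensor headers.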
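
The "mem required" line printed by llama_model_load() is simply the weight context plus the two scratch buffers and the eval buffer, with the kv self cache reported separately per state and doubled when the cache is kept in f32. A minimal sketch of that estimate for the 7B entries; the ctx_size value is a placeholder, since the real number is computed from the tensor shapes while loading:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t MB = 1024*1024;

        // 7B entries from the MEM_REQ_* tables
        const size_t scratch0 = 512ull*MB;
        const size_t scratch1 = 512ull*MB;
        const size_t eval     = 768ull*MB;
        const size_t kv_self  = 1026ull*MB;

        const size_t ctx_size = 4096ull*MB;   // placeholder: depends on the file's quantization

        const bool   f16_kv = true;           // --memory_f32 switches this off
        const size_t scale  = f16_kv ? 1 : 2; // an f32 kv cache takes twice the space

        const size_t mem_required       = ctx_size + scratch0 + scratch1 + eval;
        const size_t mem_required_state = scale*kv_self;

        printf("mem required = %7.2f MB (+ %7.2f MB per state)\n",
               mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
        return 0;
    }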
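
A recurring change in this patch is that every ggml context now draws its memory from a caller-owned std::vector (model.buf, buf_compute, the kv cache buffer) instead of an internal allocation or the old hard-coded 2048 MB eval buffer, which is what allows the buffers to be sized up front from the MEM_REQ_* tables. Roughly, the pattern looks as follows; this is a sketch that builds against this tree's ggml.h, and the 16 MB size is arbitrary:

    #include "ggml.h"

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // the caller owns the memory; ggml only carves tensors out of it
        std::vector<uint8_t> buf(16*1024*1024); // would normally come from MEM_REQ_EVAL

        struct ggml_init_params params = {
            /*.mem_size   =*/ buf.size(),
            /*.mem_buffer =*/ buf.data(),
        };

        struct ggml_context * ctx = ggml_init(params);
        if (!ctx) {
            fprintf(stderr, "ggml_init() failed\n");
            return 1;
        }

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        (void) t;

        printf("used %zu of %zu bytes\n", ggml_used_mem(ctx), buf.size());

        ggml_free(ctx); // releases ggml bookkeeping; the vector itself outlives the context
        return 0;
    }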
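
The --mtest path measures worst-case memory in two steps: a full batch at n_past == 0 (the largest prompt-processing graph) and a single token at n_past == n_predict - 1 (the largest attention span during generation). A condensed sketch of just that measurement, assuming a context created with llama_init_from_file() as in main(); llama_token and llama_eval come from llama.h:

    #include "llama.h"

    #include <vector>

    static void run_mem_test(struct llama_context * ctx, int n_batch, int n_predict, int n_threads) {
        {
            // worst case for prompt processing: one full batch of tokens at position 0
            const std::vector<llama_token> tmp(n_batch, 0);
            llama_eval(ctx, tmp.data(), tmp.size(), 0, n_threads);
        }

        {
            // worst case for generation: a single token attending over the whole context
            const std::vector<llama_token> tmp = { 0, };
            llama_eval(ctx, tmp.data(), tmp.size(), n_predict - 1, n_threads);
        }

        // with the "used_mem" printf in llama_eval_internal() enabled, the two calls
        // above report the peak compute and scratch usage for these settings
        llama_print_timings(ctx);
    }

Invoked for example as ./main -m ./models/7B/ggml-model-q4_0.bin --mtest -b 512 -n 2048, after uncommenting the used_mem block in llama.cpp as noted in the main.cpp comment above.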