From a18c19259a3cb9dec332d613e8f15704f678a468 Mon Sep 17 00:00:00 2001 From: Ben Siraphob Date: Wed, 22 Mar 2023 00:37:02 -0500 Subject: [PATCH 01/12] Fix Nix build --- flake.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flake.nix b/flake.nix index da4bd7ba3..4c2717e0d 100644 --- a/flake.nix +++ b/flake.nix @@ -28,8 +28,8 @@ ]; installPhase = '' mkdir -p $out/bin - mv llama $out/bin/llama - mv quantize $out/bin/quantize + mv bin/main $out/bin/llama + mv bin/quantize $out/bin/quantize echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml chmod +x $out/bin/convert-pth-to-ggml From ea10d3ded2994106596ddf8e4ed02741b3e053e6 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 23 Mar 2023 19:54:28 +0200 Subject: [PATCH 02/12] Command line args bounds checking (#424) * command line args bounds checking * unknown and invalid param exit codes 0 -> 1 --- utils.cpp | 101 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/utils.cpp b/utils.cpp index 1d5309c3a..45c9cabb1 100644 --- a/utils.cpp +++ b/utils.cpp @@ -26,41 +26,95 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); } + bool invalid_param = false; + std::string arg; for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; + arg = argv[i]; if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoi(argv[i]); } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); } else if (arg == "-p" || arg == "--prompt") { - params.prompt = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; } else if (arg == "-f" || arg == "--file") { - std::ifstream file(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); if (params.prompt.back() == '\n') { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); } else if (arg == "--top_k") { - params.top_k = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); } else if (arg == "-c" || arg == "--ctx_size") { - params.n_ctx = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); } else if (arg == "--memory_f16") { params.memory_f16 = true; } else if (arg == "--top_p") { - params.top_p = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); } else if (arg == "--temp") { - params.temp = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); } else if (arg == "--repeat_last_n") { - params.repeat_last_n = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); } else if (arg == "--repeat_penalty") { - params.repeat_penalty = std::stof(argv[++i]); + if (++i >= argc) 
{ + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--interactive-first") { @@ -70,13 +124,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--color") { params.use_color = true; } else if (arg == "-r" || arg == "--reverse-prompt") { - params.antiprompt.push_back(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); } else if (arg == "--perplexity") { params.perplexity = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--n_parts") { - params.n_parts = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parts = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { gpt_print_usage(argc, argv, params); exit(0); @@ -85,9 +147,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); gpt_print_usage(argc, argv, params); - exit(0); + exit(1); } } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } return true; } From ad072fc5ad6f6905a7224ff6ea07c0644aa075b1 Mon Sep 17 00:00:00 2001 From: nusu-github <29514220+nusu-github@users.noreply.github.com> Date: Fri, 24 Mar 2023 05:16:48 +0900 Subject: [PATCH 03/12] Generate library with CMake (#430) * Generate library with CMake BUILD_SHARED_LIBS to allow llama library to be generated. * Turn ON PIC when BUILD_SHARED_LIBS is ON --- CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d952afb4f..51af97c4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,6 +218,9 @@ add_library(utils OBJECT target_include_directories(utils PUBLIC .) target_compile_features(utils PUBLIC cxx_std_11) # don't bump target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(ggml OBJECT ggml.c @@ -226,6 +229,9 @@ add_library(ggml OBJECT target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(llama llama.cpp @@ -234,6 +240,10 @@ add_library(llama target_include_directories(llama PUBLIC .) 
target_compile_features(llama PUBLIC cxx_std_11) # don't bump target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) +endif() # # Executables From 20a1a4e09c522a80e2a0db51643d25fa38326065 Mon Sep 17 00:00:00 2001 From: Timmy Knight Date: Thu, 23 Mar 2023 10:18:13 -1000 Subject: [PATCH 04/12] Fix GPTQ converter (#423) * Fix GPTQ converter * Fix comment --------- Co-authored-by: Georgi Gerganov --- convert-gptq-to-ggml.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py index 7fccb4d56..6c77808fc 100644 --- a/convert-gptq-to-ggml.py +++ b/convert-gptq-to-ggml.py @@ -36,7 +36,8 @@ fname_out = sys.argv[3] fout = open(fname_out, "wb") -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex +fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", n_vocab)) fout.write(struct.pack("i", n_embd)) fout.write(struct.pack("i", n_mult)) @@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4)) # This loop unchanged from convert-pth-to-ggml.py: for i in range(tokenizer.vocab_size()): if tokenizer.is_unknown(i): - # "" token (translated as ??) text = " \u2047 ".encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) elif tokenizer.is_control(i): - # ""/"" tokens - fout.write(struct.pack("i", 0)) + text = b"" elif tokenizer.is_byte(i): - # "" tokens (which may be invalid UTF-8) piece = tokenizer.id_to_piece(i) if len(piece) != 6: - print("Invalid token: " + piece) + print(f"Invalid token: {piece}") sys.exit(1) byte_value = int(piece[3:-1], 16) - fout.write(struct.pack("i", 1)) - fout.write(struct.pack("B", byte_value)) + text = struct.pack("B", byte_value) else: - # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces. text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", tokenizer.get_score(i))) def write_header(shape, dst_name, ftype_cur): sname = dst_name.encode('utf-8') From 2e17dfd80a473099dacc0f41c9146d233c6a5972 Mon Sep 17 00:00:00 2001 From: rabidcopy Date: Thu, 23 Mar 2023 15:22:47 -0500 Subject: [PATCH 05/12] Replace EOS with newline to prevent context/memory being flushed by EOS in interactive mode (#333) * Improve interactive mode's coherence after EOS Aims to improve coherence and ability to resume the interactive session when the user is given input back after an end of text token is reached. Not sure what token 13 is or why it seems to help. See conversation for examples. 
* Make newline token a constant * dynamically determine newline token * relocate previous newline token const * cleanup whitespace * print a new line on end of text in interactive this may need to be looked into further when not using a reverse prompt * only print manual newline with reverse prompt fix formatting of reverse prompts so they don't end up at the end of the current line while not introducing unnecessary new lines otherwise * alternate approach to replace end of text tokens * Inject the reverse prompt again after eos in interactive mode * tokenize reverse prompt when needed makes this PR compatible with https://github.com/ggerganov/llama.cpp/pull/330 * tokenize and inject only first reverse prompt thanks to tjohnman * tokenize first reverse prompt once * add newline token * add newline token * tokenize/inject reverse prompt for refactor this doesn't seem right though * tokenize nothing for antiprompt if no reverse * Update main.cpp * Update main.cpp * tokenize and inject reverse prompt as needed this doesn't seem to work if the reverse prompt is tokenized outside earlier on * not needed * remove newline token * remove newline token * tokenize newline token * add space to comment * Update main.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Slaren <2141330+slaren@users.noreply.github.com> Co-authored-by: Georgi Gerganov --- main.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/main.cpp b/main.cpp index 431c94b52..5ba6d5a75 100644 --- a/main.cpp +++ b/main.cpp @@ -258,6 +258,9 @@ int main(int argc, char ** argv) { params.interactive = true; } + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); @@ -359,6 +362,16 @@ int main(int argc, char ** argv) { last_n_tokens.push_back(id); } + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos() && params.interactive) { + id = llama_token_newline.front(); + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + // add it to the context embd.push_back(id); @@ -451,12 +464,8 @@ int main(int argc, char ** argv) { // end of text token if (embd.back() == llama_token_eos()) { - if (params.interactive) { - is_interacting = true; - } else { - fprintf(stderr, " [end of text]\n"); - break; - } + fprintf(stderr, " [end of text]\n"); + break; } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. 
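For readers skimming the hunks above, the control flow that PATCH 05/12 adds around the sampled token can be restated as a small standalone sketch. Every name below (the helper and its parameters) is illustrative only and does not exist in main.cpp; the real code works directly on embd_inp and the token vectors produced by ::llama_tokenize, exactly as shown in the diff.

    #include <cstdint>
    #include <vector>

    using token_id = int32_t;

    // Illustrative restatement of the interactive-mode EOS handling in PATCH 05/12:
    // when the sampler emits end-of-text, substitute the tokenized "\n" so the
    // context is not flushed, and queue the first reverse prompt (if any) so the
    // session hands control back to the user at a natural point.
    void soften_eos(token_id & sampled,
                    token_id eos_token,                              // llama_token_eos() in the patch
                    const std::vector<token_id> & newline_tokens,    // ::llama_tokenize(ctx, "\n", false)
                    const std::vector<token_id> & first_antiprompt,  // tokenized params.antiprompt.front()
                    std::vector<token_id> & pending_input) {         // embd_inp in main.cpp
        if (sampled != eos_token) {
            return;
        }
        sampled = newline_tokens.front();  // replace EOS with the newline token
        if (!first_antiprompt.empty()) {
            // inject the first reverse prompt so generation pauses where it normally would
            pending_input.insert(pending_input.end(),
                                 first_antiprompt.begin(), first_antiprompt.end());
        }
    }

In the patch this path only runs when params.interactive is set, which is why the later end-of-text check can unconditionally print " [end of text]" and stop: in interactive mode the EOS token has already been replaced before it reaches that check.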
From 0ba5a3a9a5efedb1aeecbbc70a4e9825542472d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:32:02 +0200 Subject: [PATCH 06/12] Obsolete --- download-pth.py | 66 ------------------------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 download-pth.py diff --git a/download-pth.py b/download-pth.py deleted file mode 100644 index 129532c0c..000000000 --- a/download-pth.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import sys -from tqdm import tqdm -import requests - -if len(sys.argv) < 3: - print("Usage: download-pth.py dir-model model-type\n") - print(" model-type: Available models 7B, 13B, 30B or 65B") - sys.exit(1) - -modelsDir = sys.argv[1] -model = sys.argv[2] - -num = { - "7B": 1, - "13B": 2, - "30B": 4, - "65B": 8, -} - -if model not in num: - print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B") - sys.exit(1) - -print(f"Downloading model {model}") - -files = ["checklist.chk", "params.json"] - -for i in range(num[model]): - files.append(f"consolidated.0{i}.pth") - -resolved_path = os.path.abspath(os.path.join(modelsDir, model)) -os.makedirs(resolved_path, exist_ok=True) - -for file in files: - dest_path = os.path.join(resolved_path, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) - -files2 = ["tokenizer_checklist.chk", "tokenizer.model"] -for file in files2: - dest_path = os.path.join(modelsDir, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) \ No newline at end of file From 4cc053b6d5e9df7ac21fa06b7208a70c156d4d7a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:39:44 +0200 Subject: [PATCH 07/12] Remove oboslete command from Docker script --- .devops/tools.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index 352e04942..b0196b60d 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then ./quantize $arg2 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then ./main $arg2 -elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then - python3 ./download-pth.py $arg2 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then - echo "Downloading model..." - python3 ./download-pth.py "$1" "$2" echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do if [ -f "${i/f16/q4_0}" ]; then @@ -39,8 +35,6 @@ else echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" - echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/" - echo " ex: \"/models/\" 7B" - echo " --all-in-one (-a): Execute --download, --convert & --quantize" + echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" fi From 404e1da38ec8025707031a8027da14dc1590f952 Mon Sep 17 00:00:00 2001 From: Jed Fox Date: Thu, 23 Mar 2023 16:42:52 -0400 Subject: [PATCH 08/12] Fix quantize script not finding models in parent directory (#428) --- quantize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/quantize.py b/quantize.py index 6320b0a26..16b5963d3 100644 --- a/quantize.py +++ b/quantize.py @@ -57,6 +57,7 @@ def main(): # ) args = parser.parse_args() + args.models_path = os.path.abspath(args.models_path) if not os.path.isfile(args.quantize_script_path): print( From 483bab2e3d4a868fe679d8bb32827d2a4df214dc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 23:22:01 +0200 Subject: [PATCH 09/12] Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439) Should make results reproducible for different number of threads and batch sizes --- llama.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7de3c19c8..d55219256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -727,11 +727,13 @@ static bool llama_eval_internal( // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V_trans = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3); + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); From 4870e455b3653f7d7769fa5772b2c90ffad088df Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 00:11:53 +0200 Subject: [PATCH 10/12] Fix memory allocation issues and seg faults --- llama.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index d55219256..cf796cce3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,6 +102,9 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; + + // work buffer for transformer evaluation + std::vector buf_eval; }; struct llama_context_params llama_context_default_params() { @@ -627,27 +630,19 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; + auto & buf_eval = lctx.buf_eval; - // TODO: fix this hardcoded size - static size_t buf_size = 512u*1024*1024; - static void * buf = malloc(buf_size); + if (mem_per_token*(n_past + N + 16) > buf_eval.size()) { + const size_t buf_size_new = 1.618*buf_eval.size(); - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - 
const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new); - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } + buf_eval.resize(buf_size_new); } struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, + /*.mem_size =*/ buf_eval.size(), + /*.mem_buffer =*/ buf_eval.data(), }; struct ggml_context * ctx0 = ggml_init(params); @@ -832,10 +827,11 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + if (N == 1) { + mem_per_token = ggml_used_mem(ctx0)/(n_past + N); } - //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); + + //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024); ggml_free(ctx0); @@ -1416,6 +1412,8 @@ struct llama_context * llama_init_from_file( return nullptr; } + ctx->buf_eval.resize(512u*1024u*1024u); + return ctx; } From 3cd8dde0d1357b7f11bdd25c45d5bf5e97e284a0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 06:22:28 +0200 Subject: [PATCH 11/12] Revert "Fix memory allocation issues and seg faults" This reverts commit 4870e455b3653f7d7769fa5772b2c90ffad088df. Will provide the correct fix later --- llama.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/llama.cpp b/llama.cpp index cf796cce3..d55219256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,9 +102,6 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; - - // work buffer for transformer evaluation - std::vector buf_eval; }; struct llama_context_params llama_context_default_params() { @@ -630,19 +627,27 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; - auto & buf_eval = lctx.buf_eval; - if (mem_per_token*(n_past + N + 16) > buf_eval.size()) { - const size_t buf_size_new = 1.618*buf_eval.size(); + // TODO: fix this hardcoded size + static size_t buf_size = 512u*1024*1024; + static void * buf = malloc(buf_size); - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new); + if (mem_per_token > 0 && mem_per_token*N > buf_size) { + const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - buf_eval.resize(buf_size_new); + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } } struct ggml_init_params params = { - /*.mem_size =*/ buf_eval.size(), - /*.mem_buffer =*/ buf_eval.data(), + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf, }; struct ggml_context * ctx0 = ggml_init(params); @@ -827,11 +832,10 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), 
sizeof(float)*n_vocab); } - if (N == 1) { - mem_per_token = ggml_used_mem(ctx0)/(n_past + N); + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; } - - //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024); + //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); @@ -1412,8 +1416,6 @@ struct llama_context * llama_init_from_file( return nullptr; } - ctx->buf_eval.resize(512u*1024u*1024u); - return ctx; } From b6b268d4415fd3b3e53f22b6619b724d4928f713 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 09:13:35 +0200 Subject: [PATCH 12/12] Add link to Roadmap discussion --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee8dc1dcb..06799b5b3 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457) - New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370 -- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
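A closing note on PATCH 10/12 and its revert in PATCH 11/12: the state the revert returns to sizes the ggml evaluation scratch buffer from a measured bytes-per-token estimate and reallocates with roughly 30% headroom. The sketch below restates that heuristic with a std::vector and illustrative names; it is a simplification for reference under those assumptions, not the code that ships in llama.cpp.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Simplified restatement of the scratch-buffer heuristic restored by PATCH 11/12:
    // start from a fixed 512 MB buffer, learn bytes-per-token from the first
    // evaluation (ggml_used_mem(ctx0)/N in llama.cpp), then grow to 1.3x the
    // estimated requirement; the extra 30% covers ggml object overhead.
    struct eval_scratch {
        std::vector<uint8_t> buf = std::vector<uint8_t>(512u*1024*1024);
        size_t bytes_per_token   = 0;  // updated after an evaluation

        void ensure_capacity(size_t n_tokens) {
            const size_t needed = bytes_per_token*n_tokens;
            if (bytes_per_token > 0 && needed > buf.size()) {
                buf.resize(static_cast<size_t>(1.3*needed));
            }
        }
    };

PATCH 10/12 had replaced this with a per-context buf_eval grown by a factor of 1.618 and keyed on n_past + N; the revert message notes that a corrected fix will follow later.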