From d9dfdec2bdb1caa9aea3c272f82ababbb61d664d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 23 Mar 2023 05:33:06 -0400 Subject: [PATCH 01/77] Initial commit (llama_cpp.py, llama-cpp-python) --- examples/llama_cpp.py | 216 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 examples/llama_cpp.py diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py new file mode 100644 index 000000000..9e741dfc6 --- /dev/null +++ b/examples/llama_cpp.py @@ -0,0 +1,216 @@ +import ctypes + +from ctypes import ( + c_int, + c_float, + c_double, + c_char_p, + c_void_p, + c_bool, + POINTER, + Structure, +) + +import pathlib + +# Load the library +libfile = pathlib.Path(__file__).parent / "libllama.so" +_lib = ctypes.CDLL(str(libfile)) + + +# C types +llama_token = c_int +llama_token_p = POINTER(llama_token) + + +class llama_token_data(Structure): + _fields_ = [ + ("id", llama_token), # token id + ("p", c_float), # probability of the token + ("plog", c_float), # log probability of the token + ] + + +llama_token_data_p = POINTER(llama_token_data) + + +class llama_context_params(Structure): + _fields_ = [ + ("n_ctx", c_int), # text context + ("n_parts", c_int), # -1 for default + ("seed", c_int), # RNG seed, 0 for random + ("f16_kv", c_bool), # use fp16 for KV cache + ( + "logits_all", + c_bool, + ), # the llama_eval() call computes all logits, not just the last one + ("vocab_only", c_bool), # only load the vocabulary, no weights + ] + + +llama_context_params_p = POINTER(llama_context_params) + +llama_context_p = c_void_p + +# C functions +lib.llama_context_default_params.argtypes = [] +lib.llama_context_default_params.restype = llama_context_params + +lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] +lib.llama_init_from_file.restype = llama_context_p + +lib.llama_free.argtypes = [llama_context_p] +lib.llama_free.restype = None + +lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +lib.llama_model_quantize.restype = c_int + +lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] +lib.llama_eval.restype = c_int + +lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] +lib.llama_tokenize.restype = c_int + +lib.llama_n_vocab.argtypes = [llama_context_p] +lib.llama_n_vocab.restype = c_int + +lib.llama_n_ctx.argtypes = [llama_context_p] +lib.llama_n_ctx.restype = c_int + +lib.llama_get_logits.argtypes = [llama_context_p] +lib.llama_get_logits.restype = POINTER(c_float) + +lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] +lib.llama_token_to_str.restype = c_char_p + +lib.llama_token_bos.argtypes = [] +lib.llama_token_bos.restype = llama_token + +lib.llama_token_eos.argtypes = [] +lib.llama_token_eos.restype = llama_token + +lib.llama_sample_top_p_top_k.argtypes = [ + llama_context_p, + llama_token_p, + c_int, + c_int, + c_double, + c_double, + c_double, +] +lib.llama_sample_top_p_top_k.restype = llama_token + +lib.llama_print_timings.argtypes = [llama_context_p] +lib.llama_print_timings.restype = None + +lib.llama_reset_timings.argtypes = [llama_context_p] +lib.llama_reset_timings.restype = None + +lib.llama_print_system_info.argtypes = [] +lib.llama_print_system_info.restype = c_char_p + + +# Python functions +def llama_context_default_params() -> llama_context_params: + return _lib.llama_context_default_params() + + +def llama_init_from_file( + path_model: bytes, params: llama_context_params +) -> llama_context_p: + """Various functions for loading a 
ggml llama model. + Allocate (almost) all memory needed for the model. + Return NULL on failure""" + return _lib.llama_init_from_file(path_model, params) + + +def llama_free(ctx: llama_context_p): + """Free all allocated memory""" + return _lib.llama_free(ctx) + + +def llama_model_quantize( + fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int +) -> c_int: + """Returns 0 on success""" + return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk) + + +def llama_eval( + ctx: llama_context_p, + tokens: llama_token_p, + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> c_int: + """Run the llama inference to obtain the logits and probabilities for the next token. + tokens + n_tokens is the provided batch of new tokens to process + n_past is the number of tokens to use from previous eval calls + Returns 0 on success""" + return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) + + +def llama_tokenize( + ctx: llama_context_p, + text: bytes, + tokens: llama_token_p, + n_max_tokens: c_int, + add_bos: c_bool, +) -> c_int: + return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) + + +def llama_n_vocab(ctx: llama_context_p) -> c_int: + return _lib.llama_n_vocab(ctx) + + +def llama_n_ctx(ctx: llama_context_p) -> c_int: + return _lib.llama_n_ctx(ctx) + + +def llama_get_logits(ctx: llama_context_p): + """Token logits obtained from the last call to llama_eval() + The logits for the last token are stored in the last row + Can be mutated in order to change the probabilities of the next token + Rows: n_tokens + Cols: n_vocab""" + return _lib.llama_get_logits(ctx) + + +def llama_token_to_str(ctx: llama_context_p, token: int) -> bytes: + """Token Id -> String. Uses the vocabulary in the provided context""" + return _lib.llama_token_to_str(ctx, token) + + +def llama_token_bos() -> llama_token: + return _lib.llama_token_bos() + + +def llama_token_eos() -> llama_token: + return _lib.llama_token_eos() + + +def llama_sample_top_p_top_k( + ctx: llama_context_p, + last_n_tokens_data: llama_token_p, + last_n_tokens_size: c_int, + top_k: c_int, + top_p: c_double, + temp: c_double, + repeat_penalty: c_double, +) -> llama_token: + return _lib.llama_sample_top_p_top_k( + ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty + ) + + +def llama_print_timings(ctx: llama_context_p): + _lib.llama_print_timings(ctx) + + +def llama_reset_timings(ctx: llama_context_p): + _lib.llama_reset_timings(ctx) + + +def llama_print_system_info() -> bytes: + return _lib.llama_print_system_info() From ef5a9a616014828294070311526a802c07b52ec7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 24 Mar 2023 14:58:42 -0400 Subject: [PATCH 02/77] Update llama.cpp and re-organize low-level api --- examples/llama_cpp.py | 189 ++++++++++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 73 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 9e741dfc6..638f14238 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -19,6 +19,9 @@ _lib = ctypes.CDLL(str(libfile)) # C types +llama_context_p = c_void_p + + llama_token = c_int llama_token_p = POINTER(llama_token) @@ -45,97 +48,63 @@ class llama_context_params(Structure): c_bool, ), # the llama_eval() call computes all logits, not just the last one ("vocab_only", c_bool), # only load the vocabulary, no weights + ("use_mlock", c_bool), # force system to keep model in RAM + ("embedding", c_bool), # embedding mode only ] llama_context_params_p = POINTER(llama_context_params) 
-llama_context_p = c_void_p -# C functions -lib.llama_context_default_params.argtypes = [] -lib.llama_context_default_params.restype = llama_context_params - -lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] -lib.llama_init_from_file.restype = llama_context_p - -lib.llama_free.argtypes = [llama_context_p] -lib.llama_free.restype = None - -lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] -lib.llama_model_quantize.restype = c_int - -lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] -lib.llama_eval.restype = c_int - -lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] -lib.llama_tokenize.restype = c_int - -lib.llama_n_vocab.argtypes = [llama_context_p] -lib.llama_n_vocab.restype = c_int - -lib.llama_n_ctx.argtypes = [llama_context_p] -lib.llama_n_ctx.restype = c_int - -lib.llama_get_logits.argtypes = [llama_context_p] -lib.llama_get_logits.restype = POINTER(c_float) - -lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] -lib.llama_token_to_str.restype = c_char_p - -lib.llama_token_bos.argtypes = [] -lib.llama_token_bos.restype = llama_token - -lib.llama_token_eos.argtypes = [] -lib.llama_token_eos.restype = llama_token - -lib.llama_sample_top_p_top_k.argtypes = [ - llama_context_p, - llama_token_p, - c_int, - c_int, - c_double, - c_double, - c_double, -] -lib.llama_sample_top_p_top_k.restype = llama_token - -lib.llama_print_timings.argtypes = [llama_context_p] -lib.llama_print_timings.restype = None - -lib.llama_reset_timings.argtypes = [llama_context_p] -lib.llama_reset_timings.restype = None - -lib.llama_print_system_info.argtypes = [] -lib.llama_print_system_info.restype = c_char_p +# Functions -# Python functions def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() +_lib.llama_context_default_params.argtypes = [] +_lib.llama_context_default_params.restype = llama_context_params + + +# Various functions for loading a ggml llama model. +# Allocate (almost) all memory needed for the model. +# Return NULL on failure def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: - """Various functions for loading a ggml llama model. - Allocate (almost) all memory needed for the model. - Return NULL on failure""" return _lib.llama_init_from_file(path_model, params) +_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] +_lib.llama_init_from_file.restype = llama_context_p + + +# Frees all allocated memory def llama_free(ctx: llama_context_p): - """Free all allocated memory""" return _lib.llama_free(ctx) +_lib.llama_free.argtypes = [llama_context_p] +_lib.llama_free.restype = None + + +# TODO: not great API - very likely to change +# Returns 0 on success def llama_model_quantize( fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int ) -> c_int: - """Returns 0 on success""" return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk) +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +_lib.llama_model_quantize.restype = c_int + + +# Run the llama inference to obtain the logits and probabilities for the next token. 
+# tokens + n_tokens is the provided batch of new tokens to process +# n_past is the number of tokens to use from previous eval calls +# Returns 0 on success def llama_eval( ctx: llama_context_p, tokens: llama_token_p, @@ -143,13 +112,18 @@ def llama_eval( n_past: c_int, n_threads: c_int, ) -> c_int: - """Run the llama inference to obtain the logits and probabilities for the next token. - tokens + n_tokens is the provided batch of new tokens to process - n_past is the number of tokens to use from previous eval calls - Returns 0 on success""" return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) +_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] +_lib.llama_eval.restype = c_int + + +# Convert the provided text into tokens. +# The tokens pointer must be large enough to hold the resulting tokens. +# Returns the number of tokens on success, no more than n_max_tokens +# Returns a negative number on failure - the number of tokens that would have been returned +# TODO: not sure if correct def llama_tokenize( ctx: llama_context_p, text: bytes, @@ -160,36 +134,77 @@ def llama_tokenize( return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) +_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] +_lib.llama_tokenize.restype = c_int + + def llama_n_vocab(ctx: llama_context_p) -> c_int: return _lib.llama_n_vocab(ctx) +_lib.llama_n_vocab.argtypes = [llama_context_p] +_lib.llama_n_vocab.restype = c_int + + def llama_n_ctx(ctx: llama_context_p) -> c_int: return _lib.llama_n_ctx(ctx) +_lib.llama_n_ctx.argtypes = [llama_context_p] +_lib.llama_n_ctx.restype = c_int + + +# Token logits obtained from the last call to llama_eval() +# The logits for the last token are stored in the last row +# Can be mutated in order to change the probabilities of the next token +# Rows: n_tokens +# Cols: n_vocab def llama_get_logits(ctx: llama_context_p): - """Token logits obtained from the last call to llama_eval() - The logits for the last token are stored in the last row - Can be mutated in order to change the probabilities of the next token - Rows: n_tokens - Cols: n_vocab""" return _lib.llama_get_logits(ctx) +_lib.llama_get_logits.argtypes = [llama_context_p] +_lib.llama_get_logits.restype = POINTER(c_float) + + +# Get the embeddings for the input +# shape: [n_embd] (1-dimensional) +def llama_get_embeddings(ctx: llama_context_p): + return _lib.llama_get_embeddings(ctx) + + +_lib.llama_get_embeddings.argtypes = [llama_context_p] +_lib.llama_get_embeddings.restype = POINTER(c_float) + + +# Token Id -> String. Uses the vocabulary in the provided context def llama_token_to_str(ctx: llama_context_p, token: int) -> bytes: - """Token Id -> String. Uses the vocabulary in the provided context""" return _lib.llama_token_to_str(ctx, token) +_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] +_lib.llama_token_to_str.restype = c_char_p + +# Special tokens + + def llama_token_bos() -> llama_token: return _lib.llama_token_bos() +_lib.llama_token_bos.argtypes = [] +_lib.llama_token_bos.restype = llama_token + + def llama_token_eos() -> llama_token: return _lib.llama_token_eos() +_lib.llama_token_eos.argtypes = [] +_lib.llama_token_eos.restype = llama_token + + +# TODO: improve the last_n_tokens interface ? 
def llama_sample_top_p_top_k( ctx: llama_context_p, last_n_tokens_data: llama_token_p, @@ -204,13 +219,41 @@ def llama_sample_top_p_top_k( ) +_lib.llama_sample_top_p_top_k.argtypes = [ + llama_context_p, + llama_token_p, + c_int, + c_int, + c_double, + c_double, + c_double, +] +_lib.llama_sample_top_p_top_k.restype = llama_token + + +# Performance information + + def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) +_lib.llama_print_timings.argtypes = [llama_context_p] +_lib.llama_print_timings.restype = None + + def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) +_lib.llama_reset_timings.argtypes = [llama_context_p] +_lib.llama_reset_timings.restype = None + + +# Print system information def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() + + +_lib.llama_print_system_info.argtypes = [] +_lib.llama_print_system_info.restype = c_char_p From bd1c657f80ffe6b8cf56a55a39b16eaa20e5a056 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 4 Apr 2023 22:36:59 -0400 Subject: [PATCH 03/77] Bugfix: wrong signature for quantize function --- examples/llama_cpp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 638f14238..f7149ed67 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -92,12 +92,12 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int + fname_inp: bytes, fname_out: bytes, itype: c_int ) -> c_int: - return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk) + return _lib.llama_model_quantize(fname_inp, fname_out, itype) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int From a3da39af79cecb1dd94bf9e01f11d7a06a1b493e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 24 Mar 2023 18:43:29 -0400 Subject: [PATCH 04/77] Bugfix: cross-platform method to find shared lib --- examples/llama_cpp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index f7149ed67..bafc40112 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -12,11 +12,15 @@ from ctypes import ( ) import pathlib +from itertools import chain # Load the library -libfile = pathlib.Path(__file__).parent / "libllama.so" -_lib = ctypes.CDLL(str(libfile)) - +# TODO: fragile, should fix +_base_path = pathlib.Path(__file__).parent +(_lib_path,) = chain( + _base_path.glob("*.so"), _base_path.glob("*.dylib"), _base_path.glob("*.dll") +) +_lib = ctypes.CDLL(str(_lib_path)) # C types llama_context_p = c_void_p From 019650f41628d72ee0cfb1448e0d259f22fccaff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 31 Mar 2023 02:08:20 -0400 Subject: [PATCH 05/77] Fix array type signatures --- examples/llama_cpp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index bafc40112..1e8054e5d 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -111,7 +111,7 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: llama_token_p, + tokens: ctypes.Array[llama_token], n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -131,7 +131,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( 
ctx: llama_context_p, text: bytes, - tokens: llama_token_p, + tokens: ctypes.Array[llama_token], n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -163,7 +163,7 @@ _lib.llama_n_ctx.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +def llama_get_logits(ctx: llama_context_p) -> ctypes.Array[c_float]: return _lib.llama_get_logits(ctx) @@ -173,7 +173,7 @@ _lib.llama_get_logits.restype = POINTER(c_float) # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p) -> ctypes.Array[c_float]: return _lib.llama_get_embeddings(ctx) @@ -211,7 +211,7 @@ _lib.llama_token_eos.restype = llama_token # TODO: improve the last_n_tokens interface ? def llama_sample_top_p_top_k( ctx: llama_context_p, - last_n_tokens_data: llama_token_p, + last_n_tokens_data: ctypes.Array[llama_token], last_n_tokens_size: c_int, top_k: c_int, top_p: c_double, From a7a6d88793deaf73629adffefc9e820dda5c52ef Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 31 Mar 2023 03:20:15 -0400 Subject: [PATCH 06/77] Fix ctypes typing issue for Arrays --- examples/llama_cpp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 1e8054e5d..2a43ca328 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -9,8 +9,8 @@ from ctypes import ( c_bool, POINTER, Structure, + Array ) - import pathlib from itertools import chain @@ -111,7 +111,7 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: ctypes.Array[llama_token], + tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -131,7 +131,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens: ctypes.Array[llama_token], + tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -163,7 +163,7 @@ _lib.llama_n_ctx.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p) -> ctypes.Array[c_float]: +def llama_get_logits(ctx: llama_context_p): return _lib.llama_get_logits(ctx) @@ -173,7 +173,7 @@ _lib.llama_get_logits.restype = POINTER(c_float) # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p) -> ctypes.Array[c_float]: +def llama_get_embeddings(ctx: llama_context_p): return _lib.llama_get_embeddings(ctx) @@ -211,7 +211,7 @@ _lib.llama_token_eos.restype = llama_token # TODO: improve the last_n_tokens interface ? def llama_sample_top_p_top_k( ctx: llama_context_p, - last_n_tokens_data: ctypes.Array[llama_token], + last_n_tokens_data, # type: Array[llama_token] last_n_tokens_size: c_int, top_k: c_int, top_p: c_double, From 5bb1bc74d1764059a2bae937bddd8960d5e46e27 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 31 Mar 2023 03:25:12 -0400 Subject: [PATCH 07/77] Fix type signature of token_to_str --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 2a43ca328..214050855 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -182,7 +182,7 @@ _lib.llama_get_embeddings.restype = POINTER(c_float) # Token Id -> String. 
Uses the vocabulary in the provided context -def llama_token_to_str(ctx: llama_context_p, token: int) -> bytes: +def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) From def46dd9a68a6d6fb7818885efbd59e97175ec63 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 24 Mar 2023 18:57:25 -0400 Subject: [PATCH 08/77] Add example based on stripped down version of main.cpp from llama.cpp --- examples/low_level_api_llama_cpp.py | 85 +++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 examples/low_level_api_llama_cpp.py diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py new file mode 100644 index 000000000..4a888c355 --- /dev/null +++ b/examples/low_level_api_llama_cpp.py @@ -0,0 +1,85 @@ +import llama_cpp + +import multiprocessing + +import llama_cpp + +N_THREADS = multiprocessing.cpu_count() + +prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" + +lparams = llama_cpp.llama_context_default_params() +ctx = llama_cpp.llama_init_from_file(b"models/ggml-alpaca-7b-q4.bin", lparams) + +# determine the required inference memory per token: +tmp = [0, 1, 2, 3] +llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS) + +n_past = 0 + +prompt = b" " + prompt + +embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() +n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True) +embd_inp = embd_inp[:n_of_tok] + +n_ctx = llama_cpp.llama_n_ctx(ctx) + +n_predict = 20 +n_predict = min(n_predict, n_ctx - len(embd_inp)) + +input_consumed = 0 +input_noecho = False + +remaining_tokens = n_predict + +embd = [] +last_n_size = 64 +last_n_tokens = [0] * last_n_size +n_batch = 24 + +while remaining_tokens > 0: + if len(embd) > 0: + llama_cpp.llama_eval( + ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS + ) + + n_past += len(embd) + embd = [] + if len(embd_inp) <= input_consumed: + id = llama_cpp.llama_sample_top_p_top_k( + ctx, + (llama_cpp.c_int * len(last_n_tokens))(*last_n_tokens), + len(last_n_tokens), + 40, + 0.8, + 0.2, + 1.0 / 0.85, + ) + last_n_tokens = last_n_tokens[1:] + [id] + embd.append(id) + input_noecho = False + remaining_tokens -= 1 + else: + while len(embd_inp) > input_consumed: + embd.append(embd_inp[input_consumed]) + last_n_tokens = last_n_tokens[1:] + [embd_inp[input_consumed]] + input_consumed += 1 + if len(embd) >= n_batch: + break + if not input_noecho: + for id in embd: + print( + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"), + end="", + flush=True, + ) + + if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(): + break + +print() + +llama_cpp.llama_print_timings(ctx) + +llama_cpp.llama_free(ctx) From ef3c152257a357542be6a99eb6e44394fba01a70 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 25 Mar 2023 12:12:09 -0400 Subject: [PATCH 09/77] Update llama.cpp (llama_progress_callback) --- examples/llama_cpp.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 214050855..b5f83baa2 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -40,6 +40,7 @@ class llama_token_data(Structure): llama_token_data_p = POINTER(llama_token_data) +llama_progress_callback = ctypes.CFUNCTYPE(None, c_double, c_void_p) class llama_context_params(Structure): _fields_ = [ @@ -54,6 +55,10 @@ class llama_context_params(Structure): ("vocab_only", c_bool), # only load the vocabulary, no weights 
("use_mlock", c_bool), # force system to keep model in RAM ("embedding", c_bool), # embedding mode only + # called with a progress value between 0 and 1, pass NULL to disable + ("progress_callback", llama_progress_callback), + # context pointer passed to the progress callback + ("progress_callback_user_data", c_void_p), ] From a279acd680db28d7fc00cf68f81ee45c2b9dd3ef Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 25 Mar 2023 16:26:03 -0400 Subject: [PATCH 10/77] Update llama.cpp (llama_n_embd) --- examples/llama_cpp.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index b5f83baa2..1862605b4 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -42,6 +42,7 @@ llama_token_data_p = POINTER(llama_token_data) llama_progress_callback = ctypes.CFUNCTYPE(None, c_double, c_void_p) + class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context @@ -163,6 +164,14 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int +def llama_n_embd(ctx: llama_context_p) -> c_int: + return _lib.llama_n_ctx(ctx) + + +_lib.llama_n_embd.argtypes = [llama_context_p] +_lib.llama_n_embd.restype = c_int + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token From a71cda6546661e233ece69cb02d6b43a07ddeeb4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 28 Mar 2023 21:10:23 -0400 Subject: [PATCH 11/77] Update llama.cpp --- examples/llama_cpp.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 1862605b4..156139f71 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -3,7 +3,6 @@ import ctypes from ctypes import ( c_int, c_float, - c_double, c_char_p, c_void_p, c_bool, @@ -40,7 +39,7 @@ class llama_token_data(Structure): llama_token_data_p = POINTER(llama_token_data) -llama_progress_callback = ctypes.CFUNCTYPE(None, c_double, c_void_p) +llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) class llama_context_params(Structure): @@ -228,9 +227,9 @@ def llama_sample_top_p_top_k( last_n_tokens_data, # type: Array[llama_token] last_n_tokens_size: c_int, top_k: c_int, - top_p: c_double, - temp: c_double, - repeat_penalty: c_double, + top_p: c_float, + temp: c_float, + repeat_penalty: c_float, ) -> llama_token: return _lib.llama_sample_top_p_top_k( ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty @@ -242,9 +241,9 @@ _lib.llama_sample_top_p_top_k.argtypes = [ llama_token_p, c_int, c_int, - c_double, - c_double, - c_double, + c_float, + c_float, + c_float, ] _lib.llama_sample_top_p_top_k.restype = llama_token From 62ce167b22580e4b697be2e31e4f61a53fd10475 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 1 Apr 2023 13:02:10 -0400 Subject: [PATCH 12/77] Update low level api example --- examples/llama_cpp.py | 35 +++++++++++++++++++++++++++-- examples/low_level_api_llama_cpp.py | 10 ++++----- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 156139f71..03232560f 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -1,5 +1,4 @@ import ctypes - from ctypes import ( c_int, c_float, @@ -8,7 +7,9 @@ from ctypes import ( c_bool, POINTER, Structure, - Array + Array, + c_uint8, + c_size_t ) import pathlib from itertools import chain @@ -109,6 +110,36 
@@ def llama_model_quantize( _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int +# Returns the KV cache that will contain the context for the +# ongoing prediction with the model. +def llama_get_kv_cache(ctx: llama_context_p): + return _lib.llama_get_kv_cache(ctx) + +_lib.llama_get_kv_cache.argtypes = [llama_context_p] +_lib.llama_get_kv_cache.restype = POINTER(c_uint8) + +# Returns the size of the KV cache +def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: + return _lib.llama_get_kv_cache_size(ctx) + +_lib.llama_get_kv_cache_size.argtypes = [llama_context_p] +_lib.llama_get_kv_cache_size.restype = c_size_t + +# Returns the number of tokens in the KV cache +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: + return _lib.llama_get_kv_cache_token_count(ctx) + +_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] +_lib.llama_get_kv_cache_token_count.restype = c_int + + +# Sets the KV cache containing the current context for the model +def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int): + return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) + +_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] +_lib.llama_set_kv_cache.restype = None + # Run the llama inference to obtain the logits and probabilities for the next token. # tokens + n_tokens is the provided batch of new tokens to process diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 4a888c355..2a639aad5 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -35,7 +35,7 @@ remaining_tokens = n_predict embd = [] last_n_size = 64 -last_n_tokens = [0] * last_n_size +last_n_tokens_data = [0] * last_n_size n_batch = 24 while remaining_tokens > 0: @@ -49,21 +49,21 @@ while remaining_tokens > 0: if len(embd_inp) <= input_consumed: id = llama_cpp.llama_sample_top_p_top_k( ctx, - (llama_cpp.c_int * len(last_n_tokens))(*last_n_tokens), - len(last_n_tokens), + (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data), + len(last_n_tokens_data), 40, 0.8, 0.2, 1.0 / 0.85, ) - last_n_tokens = last_n_tokens[1:] + [id] + last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) input_noecho = False remaining_tokens -= 1 else: while len(embd_inp) > input_consumed: embd.append(embd_inp[input_consumed]) - last_n_tokens = last_n_tokens[1:] + [embd_inp[input_consumed]] + last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]] input_consumed += 1 if len(embd) >= n_batch: break From 2b8147e7a8881d91ec7da933262074101b44e30f Mon Sep 17 00:00:00 2001 From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com> Date: Sun, 2 Apr 2023 21:50:13 -0400 Subject: [PATCH 13/77] Update llama_cpp.py --- examples/llama_cpp.py | 48 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 03232560f..fe9a8934b 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -1,3 +1,5 @@ +import sys +import os import ctypes from ctypes import ( c_int, @@ -12,15 +14,47 @@ from ctypes import ( c_size_t ) import pathlib -from itertools import chain # Load the library -# TODO: fragile, should fix -_base_path = pathlib.Path(__file__).parent -(_lib_path,) = chain( - _base_path.glob("*.so"), _base_path.glob("*.dylib"), _base_path.glob("*.dll") -) -_lib = 
ctypes.CDLL(str(_lib_path)) +def _load_shared_library(lib_base_name): + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + lib_ext = ".so" + elif sys.platform == "darwin": + lib_ext = ".dylib" + elif sys.platform == "win32": + lib_ext = ".dll" + else: + raise RuntimeError("Unsupported platform") + + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(__file__).parent.resolve() + # Searching for the library in the current directory under the name "libllama" (default name + # for llamacpp) and "llama" (default name for this repo) + _lib_paths = [ + _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path / f"{lib_base_name}{lib_ext}" + ] + + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path)) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found") + +# Specify the base name of the shared library to load +_lib_base_name = "llama" + +# Load the library +_lib = _load_shared_library(_lib_base_name) # C types llama_context_p = c_void_p From 15bea0946b890a8e69deb739d790318ab4600ba8 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 3 Apr 2023 22:54:46 +0200 Subject: [PATCH 14/77] Chat llama.cpp example implementation --- examples/low_level_api_chat_cpp.py | 235 +++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 examples/low_level_api_chat_cpp.py diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py new file mode 100644 index 000000000..ec9703d1f --- /dev/null +++ b/examples/low_level_api_chat_cpp.py @@ -0,0 +1,235 @@ +""" +This is an example implementation of main.cpp from llama.cpp +Quirks: + * Its not exactly alike since this port is designed around programmatic I/O + * Input is always echoed if on, so it should be turned off when using "input()" + * The first antiprompt should be the userprompt like "\nUser:", + because its added when n_predict is reached (aka generation ended prematurely) + * n_predict can be set to -1 for unlimited length responses +""" +import llama_cpp + +def toIntArray(lst): + return [int(i) for i in lst] + +# A LLaMA interactive session +class LLaMAInteract: + def __init__(self, + primer: str="", + model: str="./models/30B/ggml-model-q4_0.bin", + n_ctx: int=1024, + seed: int=0, + n_threads: int=8, + antiprompt: list[str]=[], + input_echo: bool=True, + n_predict: int=20, + n_batch: int=8, + repeat_last_n: int=64, + top_k: int=50, + top_p: float=1., + temp: float=1.0, + repeat_penalty: float=1, + ) -> None: + # input args + self.n_threads = n_threads + self.input_echo = input_echo + self.n_predict = n_predict + self.n_batch = n_batch + self.repeat_last_n = repeat_last_n + self.top_k=top_k + self.top_p=top_p + self.temp=temp + self.repeat_penalty=repeat_penalty + self.n_ctx = n_ctx + self.seed = seed + + # runtime args + self.input_consumed = 0 + self.embd = [] + self.embd_inp = [] + self.n_past = 0 + self.first_antiprompt = [] + self.remaining_tokens = self.n_predict + self.output_echo = input_echo + + # model load + self.lparams = llama_cpp.llama_context_default_params() + self.lparams.n_ctx = self.n_ctx + self.lparams.seed 
= self.seed + self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) + + # determine the required inference memory per token: + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + + # determine newline token + self.llama_token_newline = (llama_cpp.llama_token * 1)() + llama_cpp.llama_tokenize(self.ctx, b"\n", self.llama_token_newline, len(self.llama_token_newline), False) + self.llama_token_newline = toIntArray(self.llama_token_newline) + + # primer feed + if (len(primer) > 0): + self.input(primer) + self.n_keep = len(self.embd_inp) + + # create internal context + self.n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + # determine antiprompt tokens + for i in antiprompt: + d_antiprompt = (llama_cpp.llama_token * (len(i) + 1))() + n_antiprompt = llama_cpp.llama_tokenize(self.ctx, i.encode("utf8"), d_antiprompt, len(d_antiprompt), False) + self.first_antiprompt.append(toIntArray(d_antiprompt[:n_antiprompt])) + + # if an antiprompt is present + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + + def generate(self): + while self.remaining_tokens > 0 or self.use_antiprompt(): + # predict + if len(self.embd) > 0: + # infinite text generation via context swapping + # if we run out of context: + # - take the n_keep first tokens from the original prompt (via n_past) + # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + if (self.n_past + len(self.embd) > self.n_ctx): + n_left = self.n_past - self.n_keep + self.n_past = self.n_keep + + # insert n_left/2 tokens at the start of embd from last_n_tokens + _insert = self.last_n_tokens[ + -(int(n_left/2) - len(self.embd)):-len(self.embd) + ] + self.embd[:len(_insert)] = _insert + #TODO: Still untested + + if (llama_cpp.llama_eval( + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads + ) != 0): + raise Exception("Failed to llama_eval!") + + self.n_past += len(self.embd) + self.embd = [] + if len(self.embd_inp) <= self.input_consumed: + # out of user input, sample next token + _arr = self.last_n_tokens[-min(self.repeat_last_n, self.n_past):] + id = llama_cpp.llama_sample_top_p_top_k( + self.ctx, + (llama_cpp.llama_token * len(_arr))(*_arr), + len(_arr), + self.top_k, + self.top_p, + self.temp, + self.repeat_penalty, + ) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(int(id)) + + # replace end of text token with newline token when in interactive mode + if (id == llama_cpp.llama_token_eos() and self.use_antiprompt()): + id = self.llama_token_newline[0] + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] + + # add it to the context + self.embd.append(int(id)) + + # echo this to console + self.output_echo = True + + # decrement remaining sampling budget + self.remaining_tokens -= 1 + else: + # output to console if input echo is on + self.output_echo = self.input_echo + + # some user input remains from prompt or interaction, forward it to processing + while len(self.embd_inp) > self.input_consumed: + self.embd.append(int(self.embd_inp[self.input_consumed])) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(int(self.embd_inp[self.input_consumed])) + self.input_consumed += 1 + if len(self.embd) >= self.n_batch: + break + + # display tokens + if self.output_echo: + for id in self.embd: + yield id + + # if antiprompt is present, stop + if 
(self.use_antiprompt() and len(self.embd_inp) <= self.input_consumed): + for i in self.first_antiprompt: + if i == self.last_n_tokens[-len(i):]: + return + + # if end of generation + if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + break + + # respect n_predict even if antiprompt is present + if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): + self.embd_inp += self.first_antiprompt[0] + break + + def past(self): + for id in self.last_n_tokens[-self.n_past:]: + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + + def input(self, prompt: str): + embd_arr = (llama_cpp.llama_token * (len(prompt) + 1))() + n_of_tok = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), embd_arr, len(embd_arr), True) + self.embd_inp += toIntArray(embd_arr[:n_of_tok]) + + def output(self): + self.remaining_tokens = self.n_predict + for id in self.generate(): + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + +if __name__ == "__main__": + from datetime import datetime + + USER_NAME="User" + AI_NAME="ChatLLaMa" + + time_now = datetime.now() + prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What time is it? +{AI_NAME}: It is {time_now.strftime("%H:%M")}. +{USER_NAME}: What year is it? +{AI_NAME}: We are in {time_now.strftime("%Y")}. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: Name a color. 
+{AI_NAME}: Blue +{USER_NAME}:""" + + print("Loading model...") + ll = LLaMAInteract(prompt, + model="./models/30B/ggml-model-q4_0.bin", + n_ctx=2048, + antiprompt=[f"\n{USER_NAME}:"], + repeat_last_n=256, + n_predict=2048, + temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 + ) + print("Loaded model!") + + for i in ll.output(): + print(i,end="",flush=True) + ll.input_echo = False + + inp = lambda x: f" {x}\n" + while True: + ll.input(inp(input(' '))) + for i in ll.output(): + print(i,end="",flush=True) \ No newline at end of file From 9e872410dae603d15b10cdf33fe62e9d51114c16 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 11:48:48 +0200 Subject: [PATCH 15/77] Add instruction mode --- examples/low_level_api_chat_cpp.py | 99 +++++++++++++++++++----------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index ec9703d1f..8d4e8b692 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -5,24 +5,26 @@ Quirks: * Input is always echoed if on, so it should be turned off when using "input()" * The first antiprompt should be the userprompt like "\nUser:", because its added when n_predict is reached (aka generation ended prematurely) - * n_predict can be set to -1 for unlimited length responses + * n_predict can be set to -1 for unlimited length responses (or just a really high value) + * It's always in interactive mode, generation ends either by reaching an antiprompt + or running out of n_predict. + * Instruction mode adds its own antiprompt """ import llama_cpp -def toIntArray(lst): - return [int(i) for i in lst] - # A LLaMA interactive session class LLaMAInteract: def __init__(self, primer: str="", model: str="./models/30B/ggml-model-q4_0.bin", + instruct: bool=False, n_ctx: int=1024, seed: int=0, n_threads: int=8, antiprompt: list[str]=[], input_echo: bool=True, n_predict: int=20, + n_keep: int=0, n_batch: int=8, repeat_last_n: int=64, top_k: int=50, @@ -31,17 +33,17 @@ class LLaMAInteract: repeat_penalty: float=1, ) -> None: # input args + self.instruct = instruct self.n_threads = n_threads self.input_echo = input_echo self.n_predict = n_predict + self.n_keep = n_keep self.n_batch = n_batch self.repeat_last_n = repeat_last_n self.top_k=top_k self.top_p=top_p self.temp=temp self.repeat_penalty=repeat_penalty - self.n_ctx = n_ctx - self.seed = seed # runtime args self.input_consumed = 0 @@ -54,8 +56,8 @@ class LLaMAInteract: # model load self.lparams = llama_cpp.llama_context_default_params() - self.lparams.n_ctx = self.n_ctx - self.lparams.seed = self.seed + self.lparams.n_ctx = n_ctx + self.lparams.seed = seed self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) # determine the required inference memory per token: @@ -63,29 +65,44 @@ class LLaMAInteract: llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) # determine newline token - self.llama_token_newline = (llama_cpp.llama_token * 1)() - llama_cpp.llama_tokenize(self.ctx, b"\n", self.llama_token_newline, len(self.llama_token_newline), False) - self.llama_token_newline = toIntArray(self.llama_token_newline) + self.llama_token_newline = self._tokenize("\n", False) + self.inp_prefix = self._tokenize("\n\n### Instruction:\n\n") + self.inp_suffix = self._tokenize("\n\n### Response:\n\n", False) + + # add instruction as antiprompt + if (self.instruct): + self.first_antiprompt.append(self.inp_prefix) # primer feed if (len(primer) > 0): - 
self.input(primer) - self.n_keep = len(self.embd_inp) + self.embd_inp += self._tokenize(primer) + + # break immediately if using instruct + self.init_break = self.instruct + + # number of tokens to keep when resetting context + if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): + self.n_keep = len(self.embd_inp) # create internal context - self.n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices # determine antiprompt tokens for i in antiprompt: - d_antiprompt = (llama_cpp.llama_token * (len(i) + 1))() - n_antiprompt = llama_cpp.llama_tokenize(self.ctx, i.encode("utf8"), d_antiprompt, len(d_antiprompt), False) - self.first_antiprompt.append(toIntArray(d_antiprompt[:n_antiprompt])) + self.first_antiprompt.append(self._tokenize(i, False)) + + # tokenize a prompt + def _tokenize(self, prompt, bos=True): + _arr = (llama_cpp.llama_token * (len(prompt) + 1))() + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) + return _arr[:_n] # if an antiprompt is present def use_antiprompt(self): return len(self.first_antiprompt) > 0 + # generate tokens def generate(self): while self.remaining_tokens > 0 or self.use_antiprompt(): # predict @@ -125,16 +142,16 @@ class LLaMAInteract: self.repeat_penalty, ) self.last_n_tokens.pop(0) - self.last_n_tokens.append(int(id)) + self.last_n_tokens.append(id) # replace end of text token with newline token when in interactive mode - if (id == llama_cpp.llama_token_eos() and self.use_antiprompt()): + if (id == llama_cpp.llama_token_eos() and self.use_antiprompt() and not self.instruct): id = self.llama_token_newline[0] # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] # add it to the context - self.embd.append(int(id)) + self.embd.append(id) # echo this to console self.output_echo = True @@ -147,9 +164,9 @@ class LLaMAInteract: # some user input remains from prompt or interaction, forward it to processing while len(self.embd_inp) > self.input_consumed: - self.embd.append(int(self.embd_inp[self.input_consumed])) + self.embd.append(self.embd_inp[self.input_consumed]) self.last_n_tokens.pop(0) - self.last_n_tokens.append(int(self.embd_inp[self.input_consumed])) + self.last_n_tokens.append(self.embd_inp[self.input_consumed]) self.input_consumed += 1 if len(self.embd) >= self.n_batch: break @@ -159,11 +176,17 @@ class LLaMAInteract: for id in self.embd: yield id - # if antiprompt is present, stop - if (self.use_antiprompt() and len(self.embd_inp) <= self.input_consumed): - for i in self.first_antiprompt: - if i == self.last_n_tokens[-len(i):]: - return + if (len(self.embd_inp) <= self.input_consumed): + # if antiprompt is present, stop + if (self.use_antiprompt()): + for i in self.first_antiprompt: + if i == self.last_n_tokens[-len(i):]: + return + + # if we are using instruction mode, and we have processed the initial prompt + if (self.init_break): + self.init_break = False + break # if end of generation if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): @@ -174,15 +197,20 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + # write input def input(self, prompt: str): - embd_arr = (llama_cpp.llama_token * (len(prompt) + 1))() - n_of_tok = llama_cpp.llama_tokenize(self.ctx, 
prompt.encode("utf8"), embd_arr, len(embd_arr), True) - self.embd_inp += toIntArray(embd_arr[:n_of_tok]) + if (self.instruct): + self.embd_inp += self.inp_prefix + self.embd_inp += self._tokenize(prompt + "\n") + if (self.instruct): + self.embd_inp += self.inp_suffix + # write output def output(self): self.remaining_tokens = self.n_predict for id in self.generate(): @@ -214,7 +242,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}:""" print("Loading model...") - ll = LLaMAInteract(prompt, + m = LLaMAInteract(prompt, model="./models/30B/ggml-model-q4_0.bin", n_ctx=2048, antiprompt=[f"\n{USER_NAME}:"], @@ -224,12 +252,11 @@ The transcript only includes text, it does not include markup like HTML and Mark ) print("Loaded model!") - for i in ll.output(): + for i in m.output(): print(i,end="",flush=True) - ll.input_echo = False + m.input_echo = False - inp = lambda x: f" {x}\n" while True: - ll.input(inp(input(' '))) - for i in ll.output(): + m.input(" " + input('\n> ' if m.instruct else " ")) + for i in m.output(): print(i,end="",flush=True) \ No newline at end of file From 0bfad75406c8204a95a6bcc982d8ca351c9bbd7a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 16:18:26 +0200 Subject: [PATCH 16/77] Added instruction mode, fixed infinite generation, and various other fixes --- examples/low_level_api_chat_cpp.py | 62 +++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 8d4e8b692..45a6262d4 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -8,7 +8,9 @@ Quirks: * n_predict can be set to -1 for unlimited length responses (or just a really high value) * It's always in interactive mode, generation ends either by reaching an antiprompt or running out of n_predict. - * Instruction mode adds its own antiprompt + * Instruction mode adds its own antiprompt. + You should also still be feeding the model with a "primer" prompt that + shows it the expected format. 
""" import llama_cpp @@ -31,6 +33,8 @@ class LLaMAInteract: top_p: float=1., temp: float=1.0, repeat_penalty: float=1, + instruct_inp_prefix: str="\n\n### Instruction:\n\n", + instruct_inp_suffix: str="\n\n### Response:\n\n", ) -> None: # input args self.instruct = instruct @@ -66,12 +70,12 @@ class LLaMAInteract: # determine newline token self.llama_token_newline = self._tokenize("\n", False) - self.inp_prefix = self._tokenize("\n\n### Instruction:\n\n") - self.inp_suffix = self._tokenize("\n\n### Response:\n\n", False) + self.inp_prefix = self._tokenize(instruct_inp_prefix) + self.inp_suffix = self._tokenize(instruct_inp_suffix, False) # add instruction as antiprompt if (self.instruct): - self.first_antiprompt.append(self.inp_prefix) + self.first_antiprompt.append(self.inp_prefix.strip()) # primer feed if (len(primer) > 0): @@ -117,10 +121,9 @@ class LLaMAInteract: # insert n_left/2 tokens at the start of embd from last_n_tokens _insert = self.last_n_tokens[ - -(int(n_left/2) - len(self.embd)):-len(self.embd) + self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] - self.embd[:len(_insert)] = _insert - #TODO: Still untested + self.embd = _insert + self.embd if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads @@ -197,6 +200,12 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + llama_cpp.llama_free(self.ctx) + # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: @@ -206,7 +215,7 @@ class LLaMAInteract: def input(self, prompt: str): if (self.instruct): self.embd_inp += self.inp_prefix - self.embd_inp += self._tokenize(prompt + "\n") + self.embd_inp += self._tokenize(prompt) if (self.instruct): self.embd_inp += self.inp_suffix @@ -242,21 +251,38 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}:""" print("Loading model...") - m = LLaMAInteract(prompt, + with LLaMAInteract(prompt, model="./models/30B/ggml-model-q4_0.bin", n_ctx=2048, antiprompt=[f"\n{USER_NAME}:"], repeat_last_n=256, n_predict=2048, temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 - ) - print("Loaded model!") + ) as m: + print("Loaded model!") - for i in m.output(): - print(i,end="",flush=True) - m.input_echo = False - - while True: - m.input(" " + input('\n> ' if m.instruct else " ")) for i in m.output(): - print(i,end="",flush=True) \ No newline at end of file + print(i,end="",flush=True) + m.input_echo = False + + def inp(): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + while True: + if (m.instruct): + print('\n> ', end="") + m.input(inp()) + else: + print(f" ", end="") + m.input(f" {inp()}{AI_NAME}:") + print(f"{AI_NAME}: ",end="") + + try: + for i in m.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + print(f"\n{USER_NAME}:",end="") + m.input(f"\n{USER_NAME}:") From 3c1020b86697d9c3e0cfed8f10d4b4300ebe5d84 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 16:20:27 +0200 Subject: [PATCH 17/77] Fix stripping instruction prompt --- examples/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 45a6262d4..947be4ad0 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -75,7 +75,7 @@ class LLaMAInteract: # add instruction as 
antiprompt if (self.instruct): - self.first_antiprompt.append(self.inp_prefix.strip()) + self.first_antiprompt.append(self._tokenize(self.inp_prefix.strip())) # primer feed if (len(primer) > 0): From ae1f37f505d7e9061205394493f71f45a36712ea Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 17:54:47 +0200 Subject: [PATCH 18/77] Fix repeating instructions and an antiprompt bug --- examples/low_level_api_chat_cpp.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 947be4ad0..eec2ff665 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -75,7 +75,7 @@ class LLaMAInteract: # add instruction as antiprompt if (self.instruct): - self.first_antiprompt.append(self._tokenize(self.inp_prefix.strip())) + self.first_antiprompt.append(self._tokenize(instruct_inp_prefix.strip(), False)) # primer feed if (len(primer) > 0): @@ -197,7 +197,8 @@ class LLaMAInteract: # respect n_predict even if antiprompt is present if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): - self.embd_inp += self.first_antiprompt[0] + if not self.instruct: + self.embd_inp += self.first_antiprompt[0] break def __enter__(self): @@ -213,7 +214,7 @@ class LLaMAInteract: # write input def input(self, prompt: str): - if (self.instruct): + if (self.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): self.embd_inp += self.inp_prefix self.embd_inp += self._tokenize(prompt) if (self.instruct): @@ -284,5 +285,6 @@ The transcript only includes text, it does not include markup like HTML and Mark for i in m.output(): print(i,end="",flush=True) except KeyboardInterrupt: - print(f"\n{USER_NAME}:",end="") - m.input(f"\n{USER_NAME}:") + if not m.instruct: + print(f"\n{USER_NAME}:",end="") + m.input(f"\n{USER_NAME}:") From 739e8d4c9bc268d556217c1a4b07818b542ac041 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 5 Apr 2023 14:47:24 +0200 Subject: [PATCH 19/77] Fix bug in init_break not being set when exited via antiprompt and others. 
--- examples/low_level_api_chat_cpp.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index eec2ff665..6003e0c62 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -33,6 +33,7 @@ class LLaMAInteract: top_p: float=1., temp: float=1.0, repeat_penalty: float=1, + init_break: bool=True, instruct_inp_prefix: str="\n\n### Instruction:\n\n", instruct_inp_suffix: str="\n\n### Response:\n\n", ) -> None: @@ -48,6 +49,7 @@ class LLaMAInteract: self.top_p=top_p self.temp=temp self.repeat_penalty=repeat_penalty + self.init_break = init_break # runtime args self.input_consumed = 0 @@ -81,9 +83,6 @@ class LLaMAInteract: if (len(primer) > 0): self.embd_inp += self._tokenize(primer) - # break immediately if using instruct - self.init_break = self.instruct - # number of tokens to keep when resetting context if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): self.n_keep = len(self.embd_inp) @@ -182,13 +181,14 @@ class LLaMAInteract: if (len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop if (self.use_antiprompt()): - for i in self.first_antiprompt: - if i == self.last_n_tokens[-len(i):]: - return + if True in [ + i == self.last_n_tokens[-len(i):] + for i in self.first_antiprompt + ]: + break # if we are using instruction mode, and we have processed the initial prompt if (self.init_break): - self.init_break = False break # if end of generation @@ -201,6 +201,8 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + self.init_break = False + def __enter__(self): return self From ce66405da184891a8f03e7e2d908bd4ee2926efe Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Apr 2023 04:17:26 -0400 Subject: [PATCH 20/77] Add quantize example --- examples/quantize.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 examples/quantize.py diff --git a/examples/quantize.py b/examples/quantize.py new file mode 100644 index 000000000..8bd03f88a --- /dev/null +++ b/examples/quantize.py @@ -0,0 +1,25 @@ +import os +import argparse +import llama_cpp + + +def main(args): + if not os.path.exists(fname_inp): + raise RuntimeError(f"Input file does not exist ({fname_inp})") + if os.path.exists(fname_out): + raise RuntimeError(f"Output file already exists ({fname_out})") + fname_inp = args.fname_inp.encode("utf-8") + fname_out = args.fname_out.encode("utf-8") + itype = args.itype + return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype) + if return_code != 0: + raise RuntimeError("Failed to quantize model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("fname_inp", type=str, help="Path to input model") + parser.add_argument("fname_out", type=str, help="Path to output model") + parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)") + args = parser.parse_args() + main(args) From 29e9fb66a3a09c0e744e6e82ab370ca509a90645 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Thu, 6 Apr 2023 15:30:57 +0200 Subject: [PATCH 21/77] Better llama.cpp interoperability Has some too many newline issues so WIP (Update) Fixed too many newlines, now onto args. Still needs shipping work so you could do "python -m llama_cpp.examples." etc. 
--- examples/common.py | 135 ++++++++++++ examples/low_level_api_chat_cpp.py | 342 +++++++++++++++++++---------- 2 files changed, 357 insertions(+), 120 deletions(-) create mode 100644 examples/common.py diff --git a/examples/common.py b/examples/common.py new file mode 100644 index 000000000..f80d995c5 --- /dev/null +++ b/examples/common.py @@ -0,0 +1,135 @@ +import os +import argparse + +from dataclasses import dataclass, field +from typing import List, Optional + +# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp + + +@dataclass +class GptParams: + seed: int = -1 + n_threads: int = min(4, os.cpu_count() or 1) + n_predict: int = 128 + repeat_last_n: int = 64 + n_parts: int = -1 + n_ctx: int = 512 + n_batch: int = 8 + n_keep: int = 0 + + top_k: int = 40 + top_p: float = 0.95 + temp: float = 0.80 + repeat_penalty: float = 1.10 + + model: str = "./models/llama-7B/ggml-model.bin" + prompt: str = "" + input_prefix: str = " " + fix_prefix: str = "" + output_postfix: str = "" + input_echo: bool = True, + + antiprompt: List[str] = field(default_factory=list) + + memory_f16: bool = True + random_prompt: bool = False + use_color: bool = False + interactive: bool = False + + embedding: bool = False + interactive_start: bool = False + + instruct: bool = False + ignore_eos: bool = False + perplexity: bool = False + use_mlock: bool = False + mem_test: bool = False + verbose_prompt: bool = False + + # Default instructions for Alpaca + # switch to "Human" and "Assistant" for Vicuna. + instruct_inp_prefix: str="\n\n### Instruction:\n\n", + instruct_inp_suffix: str="\n\n### Response:\n\n", + + +def gpt_params_parse(argv = None, params: Optional[GptParams] = None): + if params is None: + params = GptParams() + + parser = argparse.ArgumentParser() + parser.add_argument("-h", "--help", action="store_true", help="show this help message and exit") + parser.add_argument("-s", "--seed", type=int, default=-1, help="",dest="seed") + parser.add_argument("-t", "--threads", type=int, default=1, help="",dest="n_threads") + parser.add_argument("-p", "--prompt", type=str, default="", help="",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="") + parser.add_argument("-c", "--ctx_size", type=int, default=512, help="",dest="n_ctx") + parser.add_argument("--memory_f32", action="store_false", help="",dest="memory_f16") + parser.add_argument("--top_p", type=float, default=0.9, help="",dest="top_p") + parser.add_argument("--temp", type=float, default=1.0, help="",dest="temp") + parser.add_argument("--repeat_last_n", type=int, default=64, help="",dest="repeat_last_n") + parser.add_argument("--repeat_penalty", type=float, default=1.0, help="",dest="repeat_penalty") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="",dest="n_batch") + parser.add_argument("--keep", type=int, default=0, help="",dest="n_keep") + parser.add_argument("-m", "--model", type=str, help="",dest="model") + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument("--interactive-start", action="store_true", help="", dest="interactive_start") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive" + ) + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use 
with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument("--mlock", action="store_true",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",dest="mem_test") + parser.add_argument( + "-r", + "--reverse-prompt", + type=str, + action='append', + help="run in interactive mode and poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + dest="antiprompt" + ) + parser.add_argument("--perplexity", action="store_true", help="", dest="perplexity") + parser.add_argument("--ignore-eos", action="store_true", help="", dest="ignore_eos") + parser.add_argument("--n_parts", type=int, default=-1, help="", dest="n_parts") + parser.add_argument("--random-prompt", action="store_true", help="", dest="random_prompt") + parser.add_argument("--in-prefix", type=str, default=" ", help="", dest="input_prefix") + parser.add_argument("--fix-prefix", type=str, default=" ", help="", dest="fix_prefix") + parser.add_argument("--out-postfix", type=str, default="", help="", dest="output_postfix") + parser.add_argument("--input-noecho", action="store_false", help="", dest="input_echo") + args = parser.parse_args(argv) + return args + +def gpt_random_prompt(rng): + return [ + "So", + "Once upon a time", + "When", + "The", + "After", + "If", + "import", + "He", + "She", + "They", + ][rng % 10] + +if __name__ == "__main__": + print(GptParams(gpt_params_parse())) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 6003e0c62..e7370c01f 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -12,102 +12,182 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. 
""" +import sys +from time import time +from os import cpu_count + import llama_cpp +from common import GptParams, gpt_params_parse, gpt_random_prompt + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN # A LLaMA interactive session class LLaMAInteract: - def __init__(self, - primer: str="", - model: str="./models/30B/ggml-model-q4_0.bin", - instruct: bool=False, - n_ctx: int=1024, - seed: int=0, - n_threads: int=8, - antiprompt: list[str]=[], - input_echo: bool=True, - n_predict: int=20, - n_keep: int=0, - n_batch: int=8, - repeat_last_n: int=64, - top_k: int=50, - top_p: float=1., - temp: float=1.0, - repeat_penalty: float=1, - init_break: bool=True, - instruct_inp_prefix: str="\n\n### Instruction:\n\n", - instruct_inp_suffix: str="\n\n### Response:\n\n", - ) -> None: + def __init__(self, params: GptParams) -> None: # input args - self.instruct = instruct - self.n_threads = n_threads - self.input_echo = input_echo - self.n_predict = n_predict - self.n_keep = n_keep - self.n_batch = n_batch - self.repeat_last_n = repeat_last_n - self.top_k=top_k - self.top_p=top_p - self.temp=temp - self.repeat_penalty=repeat_penalty - self.init_break = init_break + self.params = params + + if (self.params.perplexity): + raise NotImplementedError("""************ +please use the 'perplexity' tool for perplexity calculations +************""") + + if (self.params.embedding): + raise NotImplementedError("""************ +please use the 'embedding' tool for embedding calculations +************""") + + if (self.params.n_ctx > 2048): + print(f"""warning: model does not support \ +context sizes greater than 2048 tokens ({self.params.n_ctx} \ +specified) expect poor results""", file=sys.stderr) + + if (self.params.seed <= 0): + self.params.seed = int(time()) + + print(f"seed = {self.params.seed}", file=sys.stderr) + + if (self.params.random_prompt): + self.params.prompt = gpt_random_prompt(self.params.seed) # runtime args self.input_consumed = 0 self.embd = [] - self.embd_inp = [] self.n_past = 0 self.first_antiprompt = [] - self.remaining_tokens = self.n_predict - self.output_echo = input_echo + self.remaining_tokens = self.params.n_predict + self.output_echo = self.params.input_echo # model load self.lparams = llama_cpp.llama_context_default_params() - self.lparams.n_ctx = n_ctx - self.lparams.seed = seed - self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) + self.lparams.n_ctx = self.params.n_ctx + self.lparams.n_parts = self.params.n_parts + self.lparams.seed = self.params.seed + self.lparams.memory_f16 = self.params.memory_f16 + self.lparams.use_mlock = self.params.use_mlock + + self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) + if (self.ctx == 0): + raise RuntimeError(f"error: failed to load model '{self.params.model}'") + + print(file=sys.stderr) + print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) # determine the required inference memory per token: - tmp = [0, 1, 2, 3] - llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) - - # determine newline token - self.llama_token_newline = self._tokenize("\n", False) - self.inp_prefix = self._tokenize(instruct_inp_prefix) - self.inp_suffix = 
self._tokenize(instruct_inp_suffix, False) - - # add instruction as antiprompt - if (self.instruct): - self.first_antiprompt.append(self._tokenize(instruct_inp_prefix.strip(), False)) - - # primer feed - if (len(primer) > 0): - self.embd_inp += self._tokenize(primer) - - # number of tokens to keep when resetting context - if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): - self.n_keep = len(self.embd_inp) + if (self.params.mem_test): + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + llama_cpp.llama_print_timings(self.ctx) + self.exit() + return # create internal context self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) - self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + # Add a space in front of the first character to match OG llama tokenizer behavior + self.params.prompt = " " + self.params.prompt + + # tokenize the prompt + self.embd_inp = self._tokenize(self.params.prompt) + + if (len(self.embd_inp) > self.params.n_ctx - 4): + raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + + # number of tokens to keep when resetting context + if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): + self.params.n_keep = len(self.embd_inp) + + self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix) + self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) + + # in instruct mode, we inject a prefix and a suffix to each input by the user + if (self.params.instruct): + self.params.interactive_start = True + self.first_antiprompt.append(self._tokenize(self.params.instruct_inp_prefix.strip(), False)) + + # enable interactive mode if reverse prompt or interactive start is specified + if (len(self.params.antiprompt) != 0 or self.params.interactive_start): + self.params.interactive = True + + # determine newline token + self.llama_token_newline = self._tokenize("\n", False) + + if (self.params.verbose_prompt): + print(f""" +prompt: '{self.params.prompt}' +number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) + + for i in range(len(self.embd_inp)): + print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr) + + if (self.params.n_keep > 0): + print("static prompt based on n_keep: '") + for i in range(self.params.n_keep): + print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr) + print("'", file=sys.stderr) + print(file=sys.stderr) + + if (self.params.interactive): + print("interactive mode on.", file=sys.stderr) + + if (len(self.params.antiprompt) > 0): + for antiprompt in self.params.antiprompt: + print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr) + + if len(self.params.input_prefix) > 0: + print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) + + print(f"""sampling: temp = {self.params.temp},\ +top_k = {self.params.top_k},\ +top_p = {self.params.top_p},\ +repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty} + +generate: n_ctx = {self.n_ctx}, \ +n_batch = {self.params.n_batch}, \ +n_predict = {self.params.n_predict}, \ +n_keep = {self.params.n_keep} +""", file=sys.stderr) # determine antiprompt tokens - for i in antiprompt: + for i in self.params.antiprompt: self.first_antiprompt.append(self._tokenize(i, False)) + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + if (params.interactive): + 
print("""== Running in interactive mode. == + - Press Ctrl+C to interject at any time. + - Press Return to return control to LLaMa. + - If you want to submit another line, end your input in '\\'. + +""", file=sys.stderr) + self.set_color(CONSOLE_COLOR_PROMPT) + # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) return _arr[:_n] - # if an antiprompt is present def use_antiprompt(self): return len(self.first_antiprompt) > 0 + def set_color(self, c): + if (self.params.use_color): + print(c, end="") + # generate tokens def generate(self): - while self.remaining_tokens > 0 or self.use_antiprompt(): + while self.remaining_tokens > 0 or self.params.interactive: # predict if len(self.embd) > 0: # infinite text generation via context swapping @@ -115,8 +195,8 @@ class LLaMAInteract: # - take the n_keep first tokens from the original prompt (via n_past) # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch if (self.n_past + len(self.embd) > self.n_ctx): - n_left = self.n_past - self.n_keep - self.n_past = self.n_keep + n_left = self.n_past - self.params.n_keep + self.n_past = self.params.n_keep # insert n_left/2 tokens at the start of embd from last_n_tokens _insert = self.last_n_tokens[ @@ -125,7 +205,7 @@ class LLaMAInteract: self.embd = _insert + self.embd if (llama_cpp.llama_eval( - self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") @@ -133,24 +213,28 @@ class LLaMAInteract: self.embd = [] if len(self.embd_inp) <= self.input_consumed: # out of user input, sample next token - _arr = self.last_n_tokens[-min(self.repeat_last_n, self.n_past):] + + #TODO: self.params.ignore_eos + + _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] id = llama_cpp.llama_sample_top_p_top_k( self.ctx, (llama_cpp.llama_token * len(_arr))(*_arr), len(_arr), - self.top_k, - self.top_p, - self.temp, - self.repeat_penalty, + self.params.top_k, + self.params.top_p, + self.params.temp, + self.params.repeat_penalty, ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) # replace end of text token with newline token when in interactive mode - if (id == llama_cpp.llama_token_eos() and self.use_antiprompt() and not self.instruct): + if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] - # tokenize and inject first reverse prompt - self.embd_inp += self.first_antiprompt[0] + if (self.use_antiprompt()): + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] # add it to the context self.embd.append(id) @@ -162,7 +246,7 @@ class LLaMAInteract: self.remaining_tokens -= 1 else: # output to console if input echo is on - self.output_echo = self.input_echo + self.output_echo = self.params.input_echo # some user input remains from prompt or interaction, forward it to processing while len(self.embd_inp) > self.input_consumed: @@ -170,7 +254,7 @@ class LLaMAInteract: self.last_n_tokens.pop(0) self.last_n_tokens.append(self.embd_inp[self.input_consumed]) self.input_consumed += 1 - if len(self.embd) >= self.n_batch: + if len(self.embd) >= self.params.n_batch: break # display tokens @@ -178,7 +262,11 @@ class 
LLaMAInteract: for id in self.embd: yield id - if (len(self.embd_inp) <= self.input_consumed): + # reset color to default if we there is no pending user input + if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): + self.set_color(CONSOLE_COLOR_DEFAULT) + + if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop if (self.use_antiprompt()): if True in [ @@ -188,26 +276,36 @@ class LLaMAInteract: break # if we are using instruction mode, and we have processed the initial prompt - if (self.init_break): + if (self.n_past > 0 and self.params.interactive_start): break - # if end of generation + # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + if (not self.params.instruct): + for i in " [end of text]\n": + yield i break # respect n_predict even if antiprompt is present - if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): - if not self.instruct: + if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): + # If we arent in instruction mode, fix the current generation by appending the antiprompt. + # Makes it so if chat ends prematurely you dont append the AI's text etc. + if not self.params.instruct: self.embd_inp += self.first_antiprompt[0] + self.n_remain = self.params.n_predict break - self.init_break = False + self.params.interactive_start = False def __enter__(self): return self def __exit__(self, type, value, tb): + self.exit() + + def exit(self): llama_cpp.llama_free(self.ctx) + self.set_color(CONSOLE_COLOR_DEFAULT) # return past text def past(self): @@ -216,18 +314,51 @@ class LLaMAInteract: # write input def input(self, prompt: str): - if (self.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): + if (self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): self.embd_inp += self.inp_prefix self.embd_inp += self._tokenize(prompt) - if (self.instruct): + if (self.params.instruct): self.embd_inp += self.inp_suffix # write output def output(self): - self.remaining_tokens = self.n_predict + self.remaining_tokens = self.params.n_predict for id in self.generate(): yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + # read user input + def read_input(self): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + # interactive mode + def interact(self): + for i in self.output(): + print(i,end="",flush=True) + self.params.input_echo = False + + while self.params.interactive: + self.set_color(CONSOLE_COLOR_USER_INPUT) + if (self.params.instruct): + print('\n> ', end="") + self.input(self.read_input()) + else: + print(self.params.input_prefix, end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") + print(self.params.output_postfix,end="") + self.set_color(CONSOLE_COLOR_DEFAULT) + + try: + for i in self.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + self.set_color(CONSOLE_COLOR_DEFAULT) + if not self.params.instruct: + print(self.params.fix_prefix,end="") + self.input(self.params.fix_prefix) + if __name__ == "__main__": from datetime import datetime @@ -252,41 +383,12 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. 
{AI_NAME}: Blue {USER_NAME}:""" + args = gpt_params_parse() + params = GptParams(args) - print("Loading model...") - with LLaMAInteract(prompt, - model="./models/30B/ggml-model-q4_0.bin", - n_ctx=2048, - antiprompt=[f"\n{USER_NAME}:"], - repeat_last_n=256, - n_predict=2048, - temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 - ) as m: - print("Loaded model!") + if (args.file): + with open(args.file) as f: + params.prompt = f.read() - for i in m.output(): - print(i,end="",flush=True) - m.input_echo = False - - def inp(): - out = "" - while (t := input()).endswith("\\"): - out += t[:-1] + "\n" - return out + t + "\n" - - while True: - if (m.instruct): - print('\n> ', end="") - m.input(inp()) - else: - print(f" ", end="") - m.input(f" {inp()}{AI_NAME}:") - print(f"{AI_NAME}: ",end="") - - try: - for i in m.output(): - print(i,end="",flush=True) - except KeyboardInterrupt: - if not m.instruct: - print(f"\n{USER_NAME}:",end="") - m.input(f"\n{USER_NAME}:") + with LLaMAInteract() as m: + m.interact() From d5680144c52787e2aded7decefd2370063a8dfcb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 15:05:33 -0400 Subject: [PATCH 22/77] Bugfix: Wrong size of embeddings. Closes #47 --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index fe9a8934b..5f22f6b50 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -229,7 +229,7 @@ _lib.llama_n_ctx.restype = c_int def llama_n_embd(ctx: llama_context_p) -> c_int: - return _lib.llama_n_ctx(ctx) + return _lib.llama_n_embd(ctx) _lib.llama_n_embd.argtypes = [llama_context_p] From e19909249dc87566464e7468443bd2b90e22f8b3 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 7 Apr 2023 13:32:19 +0200 Subject: [PATCH 23/77] More interoperability to the original llama.cpp, and arguments now work --- examples/common.py | 79 +++++++++++++++++------------- examples/low_level_api_chat_cpp.py | 19 ++++--- 2 files changed, 55 insertions(+), 43 deletions(-) diff --git a/examples/common.py b/examples/common.py index f80d995c5..1758a2d1d 100644 --- a/examples/common.py +++ b/examples/common.py @@ -26,9 +26,6 @@ class GptParams: model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" input_prefix: str = " " - fix_prefix: str = "" - output_postfix: str = "" - input_echo: bool = True, antiprompt: List[str] = field(default_factory=list) @@ -47,41 +44,57 @@ class GptParams: mem_test: bool = False verbose_prompt: bool = False + file: str = None + + # If chat ended prematurely, append this to the conversation to fix it. + # Set to "\nUser:" etc. + # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" + fix_prefix: str = " " + output_postfix: str = "" + input_echo: bool = True, + # Default instructions for Alpaca # switch to "Human" and "Assistant" for Vicuna. 
- instruct_inp_prefix: str="\n\n### Instruction:\n\n", - instruct_inp_suffix: str="\n\n### Response:\n\n", + # TODO: TBD how they are gonna handle this upstream + instruct_inp_prefix: str="\n\n### Instruction:\n\n" + instruct_inp_suffix: str="\n\n### Response:\n\n" def gpt_params_parse(argv = None, params: Optional[GptParams] = None): if params is None: params = GptParams() - parser = argparse.ArgumentParser() - parser.add_argument("-h", "--help", action="store_true", help="show this help message and exit") - parser.add_argument("-s", "--seed", type=int, default=-1, help="",dest="seed") - parser.add_argument("-t", "--threads", type=int, default=1, help="",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="") - parser.add_argument("-c", "--ctx_size", type=int, default=512, help="",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.9, help="",dest="top_p") - parser.add_argument("--temp", type=float, default=1.0, help="",dest="temp") - parser.add_argument("--repeat_last_n", type=int, default=64, help="",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.0, help="",dest="repeat_penalty") - parser.add_argument("-b", "--batch_size", type=int, default=8, help="",dest="n_batch") - parser.add_argument("--keep", type=int, default=0, help="",dest="n_keep") - parser.add_argument("-m", "--model", type=str, help="",dest="model") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") + parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") + parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") parser.add_argument( "-i", "--interactive", action="store_true", help="run in interactive mode", 
dest="interactive" ) parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument("--interactive-start", action="store_true", help="", dest="interactive_start") + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) parser.add_argument( "--interactive-first", action="store_true", help="run in interactive mode and wait for input right away", - dest="interactive" + dest="interactive_start" ) parser.add_argument( "-ins", @@ -96,24 +109,24 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="colorise output to distinguish prompt and user input from generations", dest="use_color" ) - parser.add_argument("--mlock", action="store_true",dest="use_mlock") - parser.add_argument("--mtest", action="store_true",dest="mem_test") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") parser.add_argument( "-r", "--reverse-prompt", type=str, action='append', - help="run in interactive mode and poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="", dest="n_parts") - parser.add_argument("--random-prompt", action="store_true", help="", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default=" ", help="", dest="input_prefix") - parser.add_argument("--fix-prefix", type=str, default=" ", help="", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="", dest="output_postfix") - parser.add_argument("--input-noecho", action="store_false", help="", dest="input_echo") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") + parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") + parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") args = parser.parse_args(argv) return args diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index e7370c01f..4badc6721 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -6,8 +6,6 @@ Quirks: * The first antiprompt should be the userprompt like "\nUser:", because its added when n_predict is reached (aka generation ended prematurely) * n_predict can be 
set to -1 for unlimited length responses (or just a really high value) - * It's always in interactive mode, generation ends either by reaching an antiprompt - or running out of n_predict. * Instruction mode adds its own antiprompt. You should also still be feeding the model with a "primer" prompt that shows it the expected format. @@ -59,7 +57,6 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 - self.embd = [] self.n_past = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict @@ -74,7 +71,7 @@ specified) expect poor results""", file=sys.stderr) self.lparams.use_mlock = self.params.use_mlock self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) - if (self.ctx == 0): + if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") print(file=sys.stderr) @@ -95,7 +92,13 @@ specified) expect poor results""", file=sys.stderr) # Add a space in front of the first character to match OG llama tokenizer behavior self.params.prompt = " " + self.params.prompt + # Load prompt file + if (self.params.file): + with open(self.params.file) as f: + self.params.prompt = f.read() + # tokenize the prompt + self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) if (len(self.embd_inp) > self.params.n_ctx - 4): @@ -384,11 +387,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {AI_NAME}: Blue {USER_NAME}:""" args = gpt_params_parse() - params = GptParams(args) + params = GptParams(**vars(args)) - if (args.file): - with open(args.file) as f: - params.prompt = f.read() - - with LLaMAInteract() as m: + with LLaMAInteract(params) as m: m.interact() From f25a81309e3aa9618d79efc53d436ba0a25a8000 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:45:55 -0400 Subject: [PATCH 24/77] Update model paths to be more clear they should point to file --- examples/low_level_api_llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 2a639aad5..b048c0ac8 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -9,7 +9,7 @@ N_THREADS = multiprocessing.cpu_count() prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" lparams = llama_cpp.llama_context_default_params() -ctx = llama_cpp.llama_init_from_file(b"models/ggml-alpaca-7b-q4.bin", lparams) +ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams) # determine the required inference memory per token: tmp = [0, 1, 2, 3] From b36c04c99e9a7885d9ecba25f7a00c8993b6d3cb Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 16:35:38 +0200 Subject: [PATCH 25/77] Added iterative search to prevent instructions from being echoed, add ignore eos, add no-mmap, fixed 1 character echo too much bug --- examples/common.py | 3 +++ examples/low_level_api_chat_cpp.py | 36 ++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/examples/common.py b/examples/common.py index 1758a2d1d..f16980ccb 100644 --- a/examples/common.py +++ b/examples/common.py @@ -40,6 +40,7 @@ class GptParams: instruct: bool = False ignore_eos: bool = False perplexity: bool = False + use_mmap: bool = True use_mlock: bool = False mem_test: bool = False verbose_prompt: bool = False @@ -110,7 +111,9 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): dest="use_color" ) 
parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") parser.add_argument( "-r", "--reverse-prompt", diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 4badc6721..cf4c99d6e 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -26,6 +26,25 @@ CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN +# Iterative search +# Actively searches and prevents a pattern from being returned +class IterSearch: + def __init__(self, pattern): + self.pattern = list(pattern) + self.buffer = [] + + def __call__(self, char): + self.buffer += [char] + + if (self.pattern[:len(self.buffer)] == self.buffer): + if (len(self.buffer) >= len(self.pattern)): + self.buffer.clear() + return [] + + _tmp = self.buffer[:] + self.buffer.clear() + return _tmp + # A LLaMA interactive session class LLaMAInteract: def __init__(self, params: GptParams) -> None: @@ -69,6 +88,7 @@ specified) expect poor results""", file=sys.stderr) self.lparams.seed = self.params.seed self.lparams.memory_f16 = self.params.memory_f16 self.lparams.use_mlock = self.params.use_mlock + self.lparams.use_mmap = self.params.use_mmap self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) if (not self.ctx): @@ -114,7 +134,9 @@ specified) expect poor results""", file=sys.stderr) # in instruct mode, we inject a prefix and a suffix to each input by the user if (self.params.instruct): self.params.interactive_start = True - self.first_antiprompt.append(self._tokenize(self.params.instruct_inp_prefix.strip(), False)) + _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) + self.first_antiprompt.append(_ptn) + self.antiecho = IterSearch(_ptn) # enable interactive mode if reverse prompt or interactive start is specified if (len(self.params.antiprompt) != 0 or self.params.interactive_start): @@ -217,7 +239,9 @@ n_keep = {self.params.n_keep} if len(self.embd_inp) <= self.input_consumed: # out of user input, sample next token - #TODO: self.params.ignore_eos + if (self.params.ignore_eos): + logits = llama_cpp.llama_get_logits(self.ctx) + logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0) _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] id = llama_cpp.llama_sample_top_p_top_k( @@ -263,7 +287,11 @@ n_keep = {self.params.n_keep} # display tokens if self.output_echo: for id in self.embd: - yield id + if self.params.instruct: + for r in self.antiecho(id): + yield r + else: + yield id # reset color to default if we there is no pending user input if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): @@ -279,7 +307,7 @@ n_keep = {self.params.n_keep} break # if we are using instruction mode, and we have processed the initial prompt - if (self.n_past > 0 and self.params.interactive_start): + if (self.params.interactive_start): break # end of text token From d1b35174773896cb3452f7875f7cacaddd486bf9 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 5 
Apr 2023 14:23:01 +0200 Subject: [PATCH 26/77] Allow local llama library usage --- examples/llama_cpp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 5f22f6b50..8bc0b577b 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -29,9 +29,12 @@ def _load_shared_library(lib_base_name): # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() + _local_path = pathlib.Path.cwd() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ + _local_path / f"./lib{lib_base_name}{lib_ext}", + _local_path / f"./{lib_base_name}{lib_ext}", _base_path / f"lib{lib_base_name}{lib_ext}", _base_path / f"{lib_base_name}{lib_ext}" ] From c8b5d0b963c7339d9b3fa98ebc1a5a7b542a2ea7 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 17:00:35 +0200 Subject: [PATCH 27/77] Use environment variable for library override --- examples/llama_cpp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 8bc0b577b..63e8e97bf 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -29,16 +29,16 @@ def _load_shared_library(lib_base_name): # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() - _local_path = pathlib.Path.cwd() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ - _local_path / f"./lib{lib_base_name}{lib_ext}", - _local_path / f"./{lib_base_name}{lib_ext}", _base_path / f"lib{lib_base_name}{lib_ext}", _base_path / f"{lib_base_name}{lib_ext}" ] + if ("LLAMA_CPP_LIB" in os.environ): + _lib_paths = [pathlib.Path(os.environ["LLAMA_CPP_LIB"]).resolve()] + # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) From 848b4021a35d6c5f2ef6e15b771d004687f1779a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 17:06:58 +0200 Subject: [PATCH 28/77] Better custom library debugging --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 63e8e97bf..89eca4bb7 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -37,6 +37,7 @@ def _load_shared_library(lib_base_name): ] if ("LLAMA_CPP_LIB" in os.environ): + lib_base_name = os.environ["LLAMA_CPP_LIB"] _lib_paths = [pathlib.Path(os.environ["LLAMA_CPP_LIB"]).resolve()] # Add the library directory to the DLL search path on Windows (if needed) From d0a7ce9abf690fe6f6ff77d31de8d4942f840787 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 17:12:25 +0200 Subject: [PATCH 29/77] Make windows users happy (hopefully) --- examples/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 89eca4bb7..1611e1635 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -38,7 +38,9 @@ def _load_shared_library(lib_base_name): if ("LLAMA_CPP_LIB" in os.environ): lib_base_name = os.environ["LLAMA_CPP_LIB"] - _lib_paths = [pathlib.Path(os.environ["LLAMA_CPP_LIB"]).resolve()] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] # Add the library directory to the DLL 
search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): From ce0ca60b5676e2dfdb892299b05d849a164b96c1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:01:33 -0400 Subject: [PATCH 30/77] Update llama.cpp (llama_mmap_supported) --- examples/llama_cpp.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 1611e1635..fa59f1a60 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -94,6 +94,7 @@ class llama_context_params(Structure): c_bool, ), # the llama_eval() call computes all logits, not just the last one ("vocab_only", c_bool), # only load the vocabulary, no weights + ("use_mmap", c_bool), # use mmap if possible ("use_mlock", c_bool), # force system to keep model in RAM ("embedding", c_bool), # embedding mode only # called with a progress value between 0 and 1, pass NULL to disable @@ -116,6 +117,17 @@ def llama_context_default_params() -> llama_context_params: _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +def llama_mmap_supported() -> c_bool: + return _lib.llama_mmap_supported() + +_lib.llama_mmap_supported.argtypes = [] +_lib.llama_mmap_supported.restype = c_bool + +def llama_mlock_supported() -> c_bool: + return _lib.llama_mlock_supported() + +_lib.llama_mlock_supported.argtypes = [] +_lib.llama_mlock_supported.restype = c_bool # Various functions for loading a ggml llama model. # Allocate (almost) all memory needed for the model. From d595f330e203d45c0760714ef3ea8f56f2b7304a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 11 Apr 2023 11:59:03 -0400 Subject: [PATCH 31/77] Update llama.cpp --- examples/llama_cpp.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index fa59f1a60..c4df029c9 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -11,10 +11,11 @@ from ctypes import ( Structure, Array, c_uint8, - c_size_t + c_size_t, ) import pathlib + # Load the library def _load_shared_library(lib_base_name): # Determine the file extension based on the platform @@ -33,10 +34,10 @@ def _load_shared_library(lib_base_name): # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ _base_path / f"lib{lib_base_name}{lib_ext}", - _base_path / f"{lib_base_name}{lib_ext}" + _base_path / f"{lib_base_name}{lib_ext}", ] - if ("LLAMA_CPP_LIB" in os.environ): + if "LLAMA_CPP_LIB" in os.environ: lib_base_name = os.environ["LLAMA_CPP_LIB"] _lib = pathlib.Path(lib_base_name) _base_path = _lib.parent.resolve() @@ -54,7 +55,10 @@ def _load_shared_library(lib_base_name): except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found") + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + # Specify the base name of the shared library to load _lib_base_name = "llama" @@ -106,6 +110,10 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) +LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors # Functions @@ -117,18 +125,23 @@ def llama_context_default_params() -> llama_context_params: 
_lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params + def llama_mmap_supported() -> c_bool: return _lib.llama_mmap_supported() + _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool + def llama_mlock_supported() -> c_bool: return _lib.llama_mlock_supported() + _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool + # Various functions for loading a ggml llama model. # Allocate (almost) all memory needed for the model. # Return NULL on failure @@ -162,33 +175,42 @@ def llama_model_quantize( _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int + # Returns the KV cache that will contain the context for the # ongoing prediction with the model. def llama_get_kv_cache(ctx: llama_context_p): return _lib.llama_get_kv_cache(ctx) + _lib.llama_get_kv_cache.argtypes = [llama_context_p] _lib.llama_get_kv_cache.restype = POINTER(c_uint8) + # Returns the size of the KV cache def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_kv_cache_size(ctx) + _lib.llama_get_kv_cache_size.argtypes = [llama_context_p] _lib.llama_get_kv_cache_size.restype = c_size_t + # Returns the number of tokens in the KV cache def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: return _lib.llama_get_kv_cache_token_count(ctx) + _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the KV cache containing the current context for the model -def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int): +def llama_set_kv_cache( + ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int +): return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) + _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] _lib.llama_set_kv_cache.restype = None From 3693449c079e8875934d3c57f1fbed744773b6f3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 14:29:00 -0400 Subject: [PATCH 32/77] Update llama.cpp --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index c4df029c9..935017ab1 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -114,6 +114,7 @@ LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4) # tok_embeddings.weight and output.weight are F16 # Functions From b6ce5133d9fa3015a44dc3b78c546cb8e5a34257 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 01:30:04 -0400 Subject: [PATCH 33/77] Add bindings for LoRA adapters. 
Closes #88 --- examples/llama_cpp.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 935017ab1..c2d1ace63 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -114,7 +114,9 @@ LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4) # tok_embeddings.weight and output.weight are F16 +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( + 4 +) # tok_embeddings.weight and output.weight are F16 # Functions @@ -177,6 +179,22 @@ _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +def llama_apply_lora_from_file( + ctx: llama_context_p, path_lora: bytes, path_base_model: bytes, n_threads: c_int +) -> c_int: + return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] +_lib.llama_apply_lora_from_file.restype = c_int + + # Returns the KV cache that will contain the context for the # ongoing prediction with the model. def llama_get_kv_cache(ctx: llama_context_p): From 8229410a4eae9996a8b4fced88d8aefbe002cf4a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 16:38:45 +0200 Subject: [PATCH 34/77] More reasonable defaults --- examples/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/common.py b/examples/common.py index f16980ccb..58a5688ba 100644 --- a/examples/common.py +++ b/examples/common.py @@ -50,7 +50,7 @@ class GptParams: # If chat ended prematurely, append this to the conversation to fix it. # Set to "\nUser:" etc. # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" - fix_prefix: str = " " + fix_prefix: str = "" output_postfix: str = "" input_echo: bool = True, From 81c4c10389a814598ba4fd2dbaadb032550e514f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 23:44:46 -0400 Subject: [PATCH 35/77] Update type signature to allow for null pointer to be passed. 
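With ctypes.c_char_p in the signature, the base-model argument can be left as
NULL (None from Python) to apply the adapter on top of the currently loaded
model. Minimal sketch only; the model and adapter paths are placeholders and
the thread count is arbitrary:

    # Apply a LoRA adapter without a separate base model (path_base_model = NULL).
    import llama_cpp

    lparams = llama_cpp.llama_context_default_params()
    ctx = llama_cpp.llama_init_from_file(b"./models/7B/ggml-model.bin", lparams)
    if llama_cpp.llama_apply_lora_from_file(ctx, b"./lora/ggml-adapter-model.bin", None, 4) != 0:
        raise RuntimeError("failed to apply LoRA adapter")
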
--- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index c2d1ace63..5e8a5c316 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -186,7 +186,7 @@ _lib.llama_model_quantize.restype = c_int # will be applied on top of the previous one # Returns 0 on success def llama_apply_lora_from_file( - ctx: llama_context_p, path_lora: bytes, path_base_model: bytes, n_threads: c_int + ctx: llama_context_p, path_lora: ctypes.c_char_p, path_base_model: ctypes.c_char_p, n_threads: c_int ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From bdbaf5dc76ef3c793b8206b3b2cc1ae0bf671513 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 17 Apr 2023 14:45:28 +0200 Subject: [PATCH 36/77] Fixed end of text wrong type, and fix n_predict behaviour --- examples/common.py | 2 +- examples/low_level_api_chat_cpp.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/common.py b/examples/common.py index 58a5688ba..061ec3ae9 100644 --- a/examples/common.py +++ b/examples/common.py @@ -75,7 +75,7 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict") + parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index cf4c99d6e..4a87d7d6b 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -144,6 +144,7 @@ specified) expect poor results""", file=sys.stderr) # determine newline token self.llama_token_newline = self._tokenize("\n", False) + self.llama_token_eot = self._tokenize(" [end of text]\n", False) if (self.params.verbose_prompt): print(f""" @@ -203,16 +204,16 @@ n_keep = {self.params.n_keep} _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) return _arr[:_n] - def use_antiprompt(self): - return len(self.first_antiprompt) > 0 - def set_color(self, c): if (self.params.use_color): print(c, end="") + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + # generate tokens def generate(self): - while self.remaining_tokens > 0 or self.params.interactive: + while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1: # predict if len(self.embd) > 0: # infinite text generation via context swapping @@ -313,7 +314,7 @@ n_keep = {self.params.n_keep} # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): if (not self.params.instruct): - for i in " [end of text]\n": + for i in self.llama_token_eot: yield i break From fd64310276801578e7bfc848664a3b4405e58674 Mon Sep 17 00:00:00 2001 From: Mug <> 
Date: Wed, 26 Apr 2023 14:37:06 +0200 Subject: [PATCH 37/77] Fix decode errors permanently --- examples/low_level_api_chat_cpp.py | 6 +++--- examples/low_level_api_llama_cpp.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 4a87d7d6b..7a932a36f 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -96,7 +96,7 @@ specified) expect poor results""", file=sys.stderr) print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) +| {llama_cpp.llama_print_system_info().decode('utf8', errors='ignore')}", file=sys.stderr) # determine the required inference memory per token: if (self.params.mem_test): @@ -342,7 +342,7 @@ n_keep = {self.params.n_keep} # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") # write input def input(self, prompt: str): @@ -356,7 +356,7 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") # read user input def read_input(self): diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index b048c0ac8..4fb5a0366 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -70,7 +70,7 @@ while remaining_tokens > 0: if not input_noecho: for id in embd: print( - llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"), + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"), end="", flush=True, ) From 5bbf40aa47b767013c692b315ab06da6d5d88a86 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 21 Apr 2023 17:40:27 -0400 Subject: [PATCH 38/77] Update llama.cpp --- examples/llama_cpp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 5e8a5c316..0005e4290 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -117,6 +117,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( 4 ) # tok_embeddings.weight and output.weight are F16 +LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors # Functions @@ -169,13 +171,14 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success +# nthread - how many threads to use. 
If <=0, will use std::thread::hardware_concurrency(), else the number given def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, itype: c_int + fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int ) -> c_int: - return _lib.llama_model_quantize(fname_inp, fname_out, itype) + return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] _lib.llama_model_quantize.restype = c_int From bf9f02d8eec049f7ab11d405aaac15a2df5b63d7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 22 Apr 2023 19:50:28 -0400 Subject: [PATCH 39/77] Update llama.cpp --- examples/llama_cpp.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 0005e4290..44ab04acf 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -189,7 +189,10 @@ _lib.llama_model_quantize.restype = c_int # will be applied on top of the previous one # Returns 0 on success def llama_apply_lora_from_file( - ctx: llama_context_p, path_lora: ctypes.c_char_p, path_base_model: ctypes.c_char_p, n_threads: c_int + ctx: llama_context_p, + path_lora: ctypes.c_char_p, + path_base_model: ctypes.c_char_p, + n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) @@ -237,6 +240,36 @@ _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, _lib.llama_set_kv_cache.restype = None +# Returns the size in bytes of the state (rng, logits, embedding and kv_cache) +def llama_get_state_size(ctx: llama_context_p) -> c_size_t: + return _lib.llama_get_state_size(ctx) + + +_lib.llama_get_state_size.argtypes = [llama_context_p] +_lib.llama_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: + return _lib.llama_copy_state_data(ctx, dest) + + +_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: + return _lib.llama_set_state_data(ctx, src) + + +_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.restype = c_size_t + + # Run the llama inference to obtain the logits and probabilities for the next token. # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls From 80c18cb66510d659f1d6b8e499da6ede8a972f57 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 09:30:10 -0400 Subject: [PATCH 40/77] Update llama.cpp (remove llama_get_kv_cache) --- examples/llama_cpp.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 44ab04acf..90f498aa5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -201,25 +201,6 @@ _lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, _lib.llama_apply_lora_from_file.restype = c_int -# Returns the KV cache that will contain the context for the -# ongoing prediction with the model. 
-def llama_get_kv_cache(ctx: llama_context_p): - return _lib.llama_get_kv_cache(ctx) - - -_lib.llama_get_kv_cache.argtypes = [llama_context_p] -_lib.llama_get_kv_cache.restype = POINTER(c_uint8) - - -# Returns the size of the KV cache -def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: - return _lib.llama_get_kv_cache_size(ctx) - - -_lib.llama_get_kv_cache_size.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_size.restype = c_size_t - - # Returns the number of tokens in the KV cache def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -229,17 +210,6 @@ _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int -# Sets the KV cache containing the current context for the model -def llama_set_kv_cache( - ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int -): - return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) - - -_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] -_lib.llama_set_kv_cache.restype = None - - # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) From 656190750d91740d468eafcbe3b53f7fd3d1c780 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 19:03:41 -0400 Subject: [PATCH 41/77] Update llama.cpp --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 90f498aa5..7c2254015 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -119,6 +119,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( ) # tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors # Functions From 66ad132575ebaecb58a354a8a5f23af70d1865c0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 26 Apr 2023 20:00:54 -0400 Subject: [PATCH 42/77] Update llama.cpp --- examples/llama_cpp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 7c2254015..6fbd393bb 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -120,6 +120,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors # Functions @@ -210,6 +212,12 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int +# Sets the current rng seed. 
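+# (illustrative note, not part of the upstream header) calling this before
+# sampling, e.g. llama_set_rng_seed(ctx, c_int(1234)), makes generations
+# reproducible for a fixed prompt and sampling parameters.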
+def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): + return _lib.llama_set_rng_seed(ctx, seed) + +_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] +_lib.llama_set_rng_seed.restype = None # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: From c8e6ac366a22c9a4c4268c4324735c267bfb6ab8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 28 Apr 2023 15:32:43 -0400 Subject: [PATCH 43/77] Update llama.cpp (llama_load_session_file) --- examples/llama_cpp.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 6fbd393bb..3ac6d6e29 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -212,13 +212,16 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int + # Sets the current rng seed. def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) + _lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] _lib.llama_set_rng_seed.restype = None + # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) @@ -249,6 +252,44 @@ _lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] _lib.llama_set_state_data.restype = c_size_t +# Save/load session file +def llama_load_session_file( + ctx: llama_context_p, + path_session: bytes, + tokens_out, + n_token_capacity: c_size_t, + n_token_count_out, +) -> c_size_t: + return _lib.llama_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.llama_load_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, + POINTER(c_size_t), +] +_lib.llama_load_session_file.restype = c_size_t + + +def llama_save_session_file( + ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t +) -> c_size_t: + return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.llama_save_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, +] +_lib.llama_save_session_file.restype = c_size_t + + # Run the llama inference to obtain the logits and probabilities for the next token. 
# tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls From 36b34943324da5fefa435263ad9739d9f9e78da9 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 26 Apr 2023 14:45:51 +0200 Subject: [PATCH 44/77] Also ignore errors on input prompts --- examples/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 7a932a36f..2e24e8683 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -201,7 +201,7 @@ n_keep = {self.params.n_keep} # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() - _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) return _arr[:_n] def set_color(self, c): From 441d30811accb7350bd6aee81d34d7ee4c8f3899 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 28 Apr 2023 12:50:30 +0200 Subject: [PATCH 45/77] Detect multi-byte responses and wait --- examples/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 2e24e8683..e046c2a79 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -96,7 +96,7 @@ specified) expect poor results""", file=sys.stderr) print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8', errors='ignore')}", file=sys.stderr) +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) # determine the required inference memory per token: if (self.params.mem_test): From d0031edbd2f5cb1559281465a40fe80ba04283b1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 10:44:28 -0400 Subject: [PATCH 46/77] Update llama.cpp --- examples/llama_cpp.py | 230 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 215 insertions(+), 15 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3ac6d6e29..3b5e66047 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -67,6 +67,12 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types +LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_MAGIC = b"ggjt" +LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" +LLAMA_SESSION_MAGIC = b"ggsn" +LLAMA_SESSION_VERSION = ctypes.c_int(0) + llama_context_p = c_void_p @@ -77,13 +83,24 @@ llama_token_p = POINTER(llama_token) class llama_token_data(Structure): _fields_ = [ ("id", llama_token), # token id + ("logit", c_float), # log-odds of the token ("p", c_float), # probability of the token - ("plog", c_float), # log probability of the token ] llama_token_data_p = POINTER(llama_token_data) + +class llama_token_data_array(Structure): + _fields_ = [ + ("data", llama_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +llama_token_data_array_p = POINTER(llama_token_data_array) + llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) @@ -118,7 +135,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( 4 ) # tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors 
LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors @@ -401,31 +418,214 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -# TODO: improve the last_n_tokens interface ? -def llama_sample_top_p_top_k( +def llama_token_nl() -> llama_token: + return _lib.llama_token_nl() + + +_lib.llama_token_nl.argtypes = [] +_lib.llama_token_nl.restype = llama_token + + +# Sampling functions +def llama_sample_repetition_penalty( ctx: llama_context_p, - last_n_tokens_data, # type: Array[llama_token] - last_n_tokens_size: c_int, - top_k: c_int, - top_p: c_float, - temp: c_float, - repeat_penalty: c_float, + candidates, + last_tokens_data, + last_tokens_size: c_int, + penalty: c_float, ) -> llama_token: - return _lib.llama_sample_top_p_top_k( - ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty + return _lib.llama_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty ) -_lib.llama_sample_top_p_top_k.argtypes = [ +_lib.llama_sample_repetition_penalty.argtypes = [ llama_context_p, + llama_token_data_array_p, llama_token_p, c_int, - c_int, c_float, +] +_lib.llama_sample_repetition_penalty.restype = llama_token + + +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def llama_sample_frequency_and_presence_penalties( + ctx: llama_context_p, + candidates, + last_tokens_data, + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +) -> llama_token: + return _lib.llama_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_token_p, + c_int, c_float, c_float, ] -_lib.llama_sample_top_p_top_k.restype = llama_token +_lib.llama_sample_frequency_and_presence_penalties.restype = llama_token + + +# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_softmax(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_softmax(ctx, candidates) + + +_lib.llama_sample_softmax.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_softmax.restype = llama_token + + +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1); +def llama_sample_top_k( + ctx: llama_context_p, candidates, k: c_int, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.llama_sample_top_k.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_int, + c_int, +] + + +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); +def llama_sample_top_p( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.llama_sample_top_p.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_top_p.restype = llama_token + + +# LLAMA_API void llama_sample_tail_free(struct llama_context 
* ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1); +def llama_sample_tail_free( + ctx: llama_context_p, candidates, z: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.llama_sample_tail_free.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_tail_free.restype = llama_token + + +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); +def llama_sample_typical( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_typical(ctx, candidates, p, min_keep) + + +_lib.llama_sample_typical.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_typical.restype = llama_token + + +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); +def llama_sample_temperature( + ctx: llama_context_p, candidates, temp: c_float +) -> llama_token: + return _lib.llama_sample_temperature(ctx, candidates, temp) + + +_lib.llama_sample_temperature.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, +] +_lib.llama_sample_temperature.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +def llama_sample_token_mirostat( + ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu +) -> llama_token: + return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.llama_sample_token_mirostat.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_int, + POINTER(c_float), +] +_lib.llama_sample_token_mirostat.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +def llama_sample_token_mirostat_v2( + ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu +) -> llama_token: + return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.llama_sample_token_mirostat_v2.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + POINTER(c_float), +] +_lib.llama_sample_token_mirostat_v2.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_token_greedy(ctx, candidates) + + +_lib.llama_sample_token_greedy.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token_greedy.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_token(ctx, candidates) + + +_lib.llama_sample_token.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token.restype = llama_token # Performance information From 78531e5d055f24614fb5b0d1659ec935794c1765 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 14:02:06 -0400 Subject: [PATCH 47/77] Fix return types and import comments --- examples/llama_cpp.py | 72 
+++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3b5e66047..601ffc6c2 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -427,13 +427,16 @@ _lib.llama_token_nl.restype = llama_token # Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, last_tokens_data, last_tokens_size: c_int, penalty: c_float, -) -> llama_token: +): return _lib.llama_sample_repetition_penalty( ctx, candidates, last_tokens_data, last_tokens_size, penalty ) @@ -446,10 +449,10 @@ _lib.llama_sample_repetition_penalty.argtypes = [ c_int, c_float, ] -_lib.llama_sample_repetition_penalty.restype = llama_token +_lib.llama_sample_repetition_penalty.restype = None -# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, @@ -457,7 +460,7 @@ def llama_sample_frequency_and_presence_penalties( last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, -) -> llama_token: +): return _lib.llama_sample_frequency_and_presence_penalties( ctx, candidates, @@ -476,11 +479,11 @@ _lib.llama_sample_frequency_and_presence_penalties.argtypes = [ c_float, c_float, ] -_lib.llama_sample_frequency_and_presence_penalties.restype = llama_token +_lib.llama_sample_frequency_and_presence_penalties.restype = None -# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_softmax(ctx: llama_context_p, candidates) -> llama_token: +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
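+# (illustrative sketch, not part of the upstream header) these samplers are
+# meant to be chained on a single llama_token_data_array built from
+# llama_get_logits(): apply the repetition/frequency penalties first, then
+# top_k / tail_free / typical / top_p / temperature, and finally
+# llama_sample_token; the low-level examples later in this series do exactly that.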
+def llama_sample_softmax(ctx: llama_context_p, candidates): return _lib.llama_sample_softmax(ctx, candidates) @@ -488,13 +491,11 @@ _lib.llama_sample_softmax.argtypes = [ llama_context_p, llama_token_data_array_p, ] -_lib.llama_sample_softmax.restype = llama_token +_lib.llama_sample_softmax.restype = None -# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1); -def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_int -) -> llama_token: +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +def llama_sample_top_k(ctx: llama_context_p, candidates, k: c_int, min_keep: c_int): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -504,12 +505,11 @@ _lib.llama_sample_top_k.argtypes = [ c_int, c_int, ] +_lib.llama_sample_top_k.restype = None -# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); -def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_int -) -> llama_token: +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +def llama_sample_top_p(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -519,13 +519,13 @@ _lib.llama_sample_top_p.argtypes = [ c_float, c_int, ] -_lib.llama_sample_top_p.restype = llama_token +_lib.llama_sample_top_p.restype = None -# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1); +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, candidates, z: c_float, min_keep: c_int -) -> llama_token: +): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -535,13 +535,11 @@ _lib.llama_sample_tail_free.argtypes = [ c_float, c_int, ] -_lib.llama_sample_tail_free.restype = llama_token +_lib.llama_sample_tail_free.restype = None -# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); -def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_int -) -> llama_token: +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
+def llama_sample_typical(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -551,13 +549,10 @@ _lib.llama_sample_typical.argtypes = [ c_float, c_int, ] -_lib.llama_sample_typical.restype = llama_token +_lib.llama_sample_typical.restype = None -# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); -def llama_sample_temperature( - ctx: llama_context_p, candidates, temp: c_float -) -> llama_token: +def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -566,10 +561,15 @@ _lib.llama_sample_temperature.argtypes = [ llama_token_data_array_p, c_float, ] -_lib.llama_sample_temperature.restype = llama_token +_lib.llama_sample_temperature.restype = None -# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu ) -> llama_token: @@ -587,7 +587,11 @@ _lib.llama_sample_token_mirostat.argtypes = [ _lib.llama_sample_token_mirostat.restype = llama_token -# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. 
A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu ) -> llama_token: @@ -604,7 +608,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ _lib.llama_sample_token_mirostat_v2.restype = llama_token -# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +# @details Selects the token with the highest probability. def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -616,7 +620,7 @@ _lib.llama_sample_token_greedy.argtypes = [ _lib.llama_sample_token_greedy.restype = llama_token -# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +# @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From c26e9bf1c1552bd076ae21bbbc1146ce7dc6d5ff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 14:47:55 -0400 Subject: [PATCH 48/77] Update sampling api --- examples/llama_cpp.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 601ffc6c2..4e4596ea7 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -495,7 +495,9 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -def llama_sample_top_k(ctx: llama_context_p, candidates, k: c_int, min_keep: c_int): +def llama_sample_top_k( + ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -503,13 +505,15 @@ _lib.llama_sample_top_k.argtypes = [ llama_context_p, llama_token_data_array_p, c_int, - c_int, + c_size_t, ] _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -def llama_sample_top_p(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): +def llama_sample_top_p( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -517,14 +521,14 @@ _lib.llama_sample_top_p.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
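# (note, mirroring the --tfs help text added later in this series) z = 1.0
# leaves the candidate distribution unchanged, i.e. tail free sampling is
# effectively disabled; min_keep appears to set a floor on how many candidates
# are kept.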
def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_int + ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -533,13 +537,15 @@ _lib.llama_sample_tail_free.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. -def llama_sample_typical(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): +def llama_sample_typical( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -547,7 +553,7 @@ _lib.llama_sample_typical.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_typical.restype = None From d15578e63e5648373d42f04a31ca6e37055457ea Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 09:33:30 -0400 Subject: [PATCH 49/77] Update llama.cpp (session version) --- examples/llama_cpp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 4e4596ea7..5baa6cc76 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -71,7 +71,7 @@ LLAMA_FILE_VERSION = ctypes.c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(0) +LLAMA_SESSION_VERSION = ctypes.c_int(1) llama_context_p = c_void_p @@ -239,7 +239,8 @@ _lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] _lib.llama_set_rng_seed.restype = None -# Returns the size in bytes of the state (rng, logits, embedding and kv_cache) +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) From 9e79465b215497409c5740f4285f1b508938ea93 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:05:31 -0400 Subject: [PATCH 50/77] Prefer explicit imports --- examples/llama_cpp.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 5baa6cc76..a56243dc9 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -67,11 +67,11 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_VERSION = c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(1) +LLAMA_SESSION_VERSION = c_int(1) llama_context_p = c_void_p @@ -127,18 +127,18 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) -LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( +LLAMA_FTYPE_ALL_F32 = c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and 
output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_3 = c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors # Functions @@ -210,8 +210,8 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_apply_lora_from_file( ctx: llama_context_p, - path_lora: ctypes.c_char_p, - path_base_model: ctypes.c_char_p, + path_lora: c_char_p, + path_base_model: c_char_p, n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From 32cf0133c9d494642e56abf0e7cb5ae0cceb0bcb Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Thu, 4 May 2023 18:33:08 +0200 Subject: [PATCH 51/77] Update low level examples --- examples/Chat.py | 70 ++++++++++ examples/Miku.py | 59 ++++++++ examples/ReasonAct.py | 49 +++++++ examples/common.py | 163 ++++++++++++++-------- examples/low_level_api_chat_cpp.py | 202 ++++++++++++++++++++++++---- examples/low_level_api_llama_cpp.py | 35 +++-- 6 files changed, 486 insertions(+), 92 deletions(-) create mode 100644 examples/Chat.py create mode 100644 examples/Miku.py create mode 100644 examples/ReasonAct.py diff --git a/examples/Chat.py b/examples/Chat.py new file mode 100644 index 000000000..9283fcb8a --- /dev/null +++ b/examples/Chat.py @@ -0,0 +1,70 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "USER") +N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) +N_THREAD = int(env_or_def("N_THREAD", "8")) + +today = datetime.datetime.today() +DATE_YEAR=today.strftime("%Y") +DATE_TIME=today.strftime("%H:%M") + +prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What year is it? +{AI_NAME}: We are in {DATE_YEAR}. +{USER_NAME}: Please tell me the largest city in Europe. +{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. +{USER_NAME}: What can you tell me about Moscow? +{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. 
Outside its walls is Red Square, Russia’s symbolic center. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: How do I pass command line arguments to a Node.js program? +{AI_NAME}: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. +{USER_NAME}: Name a color. +{AI_NAME}: Blue. +{USER_NAME}: What time is it? +{AI_NAME}: It is {DATE_TIME}. +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/Miku.py b/examples/Miku.py new file mode 100644 index 000000000..86fd1d74c --- /dev/null +++ b/examples/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! 
+{AI_NAME}: What do you like to do in your free time? ^_^ +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_batch=1024, + n_ctx=2048, + n_keep=-1, + repeat_last_n=256, + repeat_penalty=1.17647, + temp=0.7, + top_k=40, + top_p=0.5, + model=MODEL, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + prompt=prompt, +) + +if N_THREAD > 0: + params.n_threads = N_THREAD + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/ReasonAct.py b/examples/ReasonAct.py new file mode 100644 index 000000000..cf0a13747 --- /dev/null +++ b/examples/ReasonAct.py @@ -0,0 +1,49 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") + +prompt=f"""You run in a loop of Thought, Action, Observation. +At the end of the loop either Answer or restate your Thought and Action. +Use Thought to describe your thoughts about the question you have been asked. +Use Action to run one of these actions available to you: +- calculate[python math expression] +Observation will be the result of running those actions + + +Question: What is 4 * 7 / 3? +Thought: Do I need to use an action? Yes, I use calculate to do math +Action: calculate[4 * 7 / 3] +Observation: 9.3333333333 +Thought: Do I need to use an action? No, have the result +Answer: The calculate tool says it is 9.3333333333 +Question: What is capital of france? +Thought: Do I need to use an action? No, I know the answer +Answer: Paris is the capital of France +Question:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + interactive=True, + interactive_start=True, + top_k=10000, + temp=0.2, + repeat_penalty=1, + n_threads=7, + n_ctx=2048, + antiprompt=["Question:","Observation:"], + model=MODEL, + input_prefix=" ", + n_predict=-1, + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/common.py b/examples/common.py index 061ec3ae9..9a465db6e 100644 --- a/examples/common.py +++ b/examples/common.py @@ -1,8 +1,9 @@ import os import argparse +import re from dataclasses import dataclass, field -from typing import List, Optional +from typing import List # Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp @@ -12,23 +13,35 @@ class GptParams: seed: int = -1 n_threads: int = min(4, os.cpu_count() or 1) n_predict: int = 128 - repeat_last_n: int = 64 n_parts: int = -1 n_ctx: int = 512 n_batch: int = 8 n_keep: int = 0 + ignore_eos: bool = False + logit_bias: dict[int, float] = field(default_factory=dict) top_k: int = 40 top_p: float = 0.95 + tfs_z: float = 1.00 + typical_p: float = 1.00 temp: float = 0.80 repeat_penalty: float = 1.10 + repeat_last_n: int = 64 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 + mirostat: int = 0 + mirostat_tau: float = 5.0 + mirostat_eta: float = 0.1 model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" + path_session: str = "" input_prefix: str = " " - antiprompt: List[str] = field(default_factory=list) + lora_adapter: str = "" + lora_base: str = "" + memory_f16: bool = True random_prompt: bool = False use_color: bool = False @@ -38,7 +51,7 @@ class GptParams: interactive_start: bool = False instruct: bool = False - ignore_eos: bool = False + penalize_nl: bool = True perplexity: bool = False 
use_mmap: bool = True use_mlock: bool = False @@ -61,59 +74,42 @@ class GptParams: instruct_inp_suffix: str="\n\n### Response:\n\n" -def gpt_params_parse(argv = None, params: Optional[GptParams] = None): - if params is None: - params = GptParams() - +def gpt_params_parse(argv = None): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") - parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") - parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") - parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + + parser.add_argument( + "-l", + "--logit-bias", + type=str, + action='append', + help="--logit-bias TOKEN_ID(+/-)BIAS", + dest="logit_bias_str" + ) + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") + parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") + parser.add_argument("--mirostat", type=float, default=1.0, 
help="use Mirostat sampling.",dest="mirostat") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau") + parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") - parser.add_argument( - "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" - ) - parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument( - "--interactive-start", - action="store_true", - help="run in interactive mode", - dest="interactive" - ) - parser.add_argument( - "--interactive-first", - action="store_true", - help="run in interactive mode and wait for input right away", - dest="interactive_start" - ) - parser.add_argument( - "-ins", - "--instruct", - action="store_true", - help="run in instruction mode (use with Alpaca or Vicuna models)", - dest="instruct" - ) - parser.add_argument( - "--color", - action="store_true", - help="colorise output to distinguish prompt and user input from generations", - dest="use_color" - ) - parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") - parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") - parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") - parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument( "-r", "--reverse-prompt", @@ -122,16 +118,71 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + + parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") + parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") + + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default="", help="string to 
prefix user inputs with", dest="input_prefix") + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive_start" + ) + + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + + #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") + + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) + args = parser.parse_args(argv) - return args + + logit_bias_str = args.logit_bias_str + delattr(args, "logit_bias_str") + params = GptParams(**vars(args)) + + if (params.lora_adapter): + params.use_mmap = False + + if (logit_bias_str != None): + for i in logit_bias_str: + if (m := re.match(r"(\d+)([-+]\d+)", i)): + params.logit_bias[int(m.group(1))] = int(m.group(2)) + + return params def gpt_random_prompt(rng): return [ @@ -148,4 +199,4 @@ def gpt_random_prompt(rng): ][rng % 10] if __name__ == "__main__": - print(GptParams(gpt_params_parse())) + print(gpt_params_parse()) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index e046c2a79..d024f0860 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -10,9 +10,10 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. 
""" +import ctypes import sys from time import time -from os import cpu_count +from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt @@ -77,6 +78,7 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 self.n_past = 0 + self.n_session_consumed = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo @@ -94,6 +96,19 @@ specified) expect poor results""", file=sys.stderr) if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter, + self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ | {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) @@ -117,13 +132,49 @@ specified) expect poor results""", file=sys.stderr) with open(self.params.file) as f: self.params.prompt = f.read() + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_int() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 0): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + # tokenize the prompt self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) - if (len(self.embd_inp) > self.params.n_ctx - 4): + if (len(self.embd_inp) > self.n_ctx - 4): raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + # debug message about similarity of saved session, if applicable + n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + break + n_matching_session_tokens += 1 + + if n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): self.params.n_keep = len(self.embd_inp) @@ -132,6 +183,7 @@ specified) expect poor results""", file=sys.stderr) self.inp_suffix = 
self._tokenize(self.params.instruct_inp_suffix, False) # in instruct mode, we inject a prefix and a suffix to each input by the user + self.antiecho = None if (self.params.instruct): self.params.interactive_start = True _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) @@ -171,16 +223,24 @@ number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) if len(self.params.input_prefix) > 0: print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) - print(f"""sampling: temp = {self.params.temp},\ + print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty},\ +presence_penalty = {self.params.presence_penalty},\ +frequency_penalty = {self.params.frequency_penalty},\ top_k = {self.params.top_k},\ +tfs_z = {self.params.tfs_z},\ top_p = {self.params.top_p},\ -repeat_last_n = {self.params.repeat_last_n},\ -repeat_penalty = {self.params.repeat_penalty} +typical_p = {self.params.typical_p},\ +temp = {self.params.temp},\ +mirostat = {self.params.mirostat},\ +mirostat_lr = {self.params.mirostat_eta},\ +mirostat_ent = {self.params.mirostat_tau},\ -generate: n_ctx = {self.n_ctx}, \ -n_batch = {self.params.n_batch}, \ -n_predict = {self.params.n_predict}, \ +generate: n_ctx = {self.n_ctx},\ +n_batch = {self.params.n_batch},\ +n_predict = {self.params.n_predict},\ n_keep = {self.params.n_keep} + """, file=sys.stderr) # determine antiprompt tokens @@ -198,6 +258,9 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(CONSOLE_COLOR_PROMPT) + self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) + + # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() @@ -229,31 +292,117 @@ n_keep = {self.params.n_keep} self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] self.embd = _insert + self.embd + self.params.path_session = "" + + # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + # REVIEW + if self.n_session_consumed < len(self.session_tokens): + for i in range(len(self.embd)): + if self.embd[i] != self.session_tokens[self.n_session_consumed]: + self.session_tokens = self.session_tokens[:self.n_session_consumed] + break + + self.n_past += 1 + self.n_session_consumed += 1 + + if self.n_session_consumed >= len(self.session_tokens): + i += 1 + break + + if i > 0: + self.embd = self.embd[i:] + + # evaluate tokens in batches + # embd is typically prepared beforehand to fit within a batch, but not always + #TODO BUG: The batching code causes nonsensical generation + """for i in range(0, len(self.embd), self.params.n_batch): + n_eval = self.params.n_batch + _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + print(f"failed to eval") + return + + self.n_past += n_eval""" if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") + if len(self.embd) > 0 and not len(self.params.path_session) > 0: + self.session_tokens.extend(self.embd) + self.n_session_consumed = len(self.session_tokens) + self.n_past += len(self.embd) self.embd = [] - if len(self.embd_inp) <= self.input_consumed: + if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting # out of user input, sample next token + top_k = 
llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k + repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n - if (self.params.ignore_eos): - logits = llama_cpp.llama_get_logits(self.ctx) - logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0) + # optionally save the session on first sample (for faster prompt loading next time) + if len(self.params.path_session) > 0 and self.need_to_save_session: + self.need_to_save_session = False + llama_cpp.llama_save_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + self.session_tokens, + len(self.session_tokens) + ) + + id = 0 + + logits = llama_cpp.llama_get_logits(self.ctx) + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + + # Apply params.logit_bias map + for key, value in self.params.logit_bias.items(): + logits[key] += value + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + # Apply penalties + nl_logit = logits[llama_cpp.llama_token_nl()] + last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) + + _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) + llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + + if not self.params.penalize_nl: + logits[llama_cpp.llama_token_nl()] = nl_logit + + if self.params.temp <= 0: + # Greedy sampling + id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + else: + if self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + else: + # Temperature sampling + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # print("`{}`".format(candidates_p.size)) - _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] - id = llama_cpp.llama_sample_top_p_top_k( - self.ctx, - (llama_cpp.llama_token * len(_arr))(*_arr), - len(_arr), - self.params.top_k, - self.params.top_p, - self.params.temp, - self.params.repeat_penalty, - ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) @@ -288,7 +437,7 @@ n_keep = {self.params.n_keep} # display tokens if self.output_echo: for id in self.embd: - if self.params.instruct: + if self.antiecho != None: for r in self.antiecho(id): yield r 
else: @@ -316,7 +465,7 @@ n_keep = {self.params.n_keep} if (not self.params.instruct): for i in self.llama_token_eot: yield i - break + break # respect n_predict even if antiprompt is present if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): @@ -356,7 +505,7 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") # read user input def read_input(self): @@ -415,8 +564,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. {AI_NAME}: Blue {USER_NAME}:""" - args = gpt_params_parse() - params = GptParams(**vars(args)) + params = gpt_params_parse() with LLaMAInteract(params) as m: m.interact() diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 4fb5a0366..9e38ec7cb 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -37,6 +37,10 @@ embd = [] last_n_size = 64 last_n_tokens_data = [0] * last_n_size n_batch = 24 +last_n_repeat = 64 +repeat_penalty = 1 +frequency_penalty = 0.0 +presence_penalty = 0.0 while remaining_tokens > 0: if len(embd) > 0: @@ -47,15 +51,28 @@ while remaining_tokens > 0: n_past += len(embd) embd = [] if len(embd_inp) <= input_consumed: - id = llama_cpp.llama_sample_top_p_top_k( - ctx, - (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data), - len(last_n_tokens_data), - 40, - 0.8, - 0.2, - 1.0 / 0.85, - ) + logits = llama_cpp.llama_get_logits(ctx) + n_vocab = llama_cpp.llama_n_vocab(ctx) + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, + _arr, + last_n_repeat, repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, + _arr, + last_n_repeat, frequency_penalty, presence_penalty) + + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) + id = llama_cpp.llama_sample_token(ctx, candidates_p) + last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) input_noecho = False From 335cd8d947cc2cf4608885629dc9e63eaa061150 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:18:25 +0200 Subject: [PATCH 52/77] Rename postfix to suffix to match upstream --- examples/Chat.py | 1 + examples/common.py | 4 ++-- examples/low_level_api_chat_cpp.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/Chat.py b/examples/Chat.py index 9283fcb8a..c78347168 100644 --- a/examples/Chat.py +++ b/examples/Chat.py @@ -63,6 +63,7 @@ params = GptParams( interactive=True, antiprompt=[f"{USER_NAME}:"], input_prefix=" ", + input_suffix=f"{AI_NAME}:", prompt=prompt, ) diff --git a/examples/common.py b/examples/common.py index 9a465db6e..75a952583 100644 --- a/examples/common.py +++ b/examples/common.py @@ -37,6 +37,7 @@ class GptParams: prompt: str = "" path_session: str = "" input_prefix: str = " " + input_suffix: str = "" antiprompt: 
List[str] = field(default_factory=list) lora_adapter: str = "" @@ -64,7 +65,6 @@ class GptParams: # Set to "\nUser:" etc. # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" fix_prefix: str = "" - output_postfix: str = "" input_echo: bool = True, # Default instructions for Alpaca @@ -110,6 +110,7 @@ def gpt_params_parse(argv = None): parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument( "-r", "--reverse-prompt", @@ -158,7 +159,6 @@ def gpt_params_parse(argv = None): #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") parser.add_argument( diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index d024f0860..d3a7d466f 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -527,8 +527,8 @@ n_keep = {self.params.n_keep} self.input(self.read_input()) else: print(self.params.input_prefix, end="") - self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") - print(self.params.output_postfix,end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") self.set_color(CONSOLE_COLOR_DEFAULT) try: From bbf6848cb07b8fd73f80baa5d546eacd27f9c8b2 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:27:52 +0200 Subject: [PATCH 53/77] Wrong logit_bias parsed type --- examples/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/common.py b/examples/common.py index 75a952583..b51c28b16 100644 --- a/examples/common.py +++ b/examples/common.py @@ -180,7 +180,7 @@ def gpt_params_parse(argv = None): if (logit_bias_str != None): for i in logit_bias_str: if (m := re.match(r"(\d+)([-+]\d+)", i)): - params.logit_bias[int(m.group(1))] = int(m.group(2)) + params.logit_bias[int(m.group(1))] = float(m.group(2)) return params From f8ba031576fadd86601664a43916d4489387fa19 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:42 +0200 Subject: [PATCH 54/77] Fix lora --- examples/low_level_api_chat_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index d3a7d466f..12f7e4510 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -102,8 +102,8 @@ specified) expect poor results""", file=sys.stderr) if (len(self.params.lora_adapter) > 0): if (llama_cpp.llama_apply_lora_from_file( self.ctx, - self.params.lora_adapter, - self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else 
None, self.params.n_threads ) != 0): print("error: failed to apply lora adapter") From 0bf36a77aead8bdd2f73c3a960afb32bf10de916 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:35:50 +0200 Subject: [PATCH 55/77] Fix mirastat requiring c_float --- examples/low_level_api_chat_cpp.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 12f7e4510..c55ca2fdf 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -372,10 +372,10 @@ n_keep = {self.params.n_keep} _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.repeat_penalty) + last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) if not self.params.penalize_nl: logits[llama_cpp.llama_token_nl()] = nl_logit @@ -387,19 +387,19 @@ n_keep = {self.params.n_keep} if self.params.mirostat == 1: mirostat_mu = 2.0 * self.params.mirostat_tau mirostat_m = 100 - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) elif self.params.mirostat == 2: mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) From fb79c567d242f85b5d00e8b60f231a8560918250 Mon Sep 17 00:00:00 2001 From: Mug 
<2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:03 +0200 Subject: [PATCH 56/77] Fix session loading and saving in low level example chat --- examples/low_level_api_chat_cpp.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index c55ca2fdf..205a5b76c 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -138,16 +138,17 @@ specified) expect poor results""", file=sys.stderr) if (path.exists(self.params.path_session)): _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() - _n_token_count_out = llama_cpp.c_int() + _n_token_count_out = llama_cpp.c_size_t() if (llama_cpp.llama_load_session_file( self.ctx, self.params.path_session.encode("utf8"), _session_tokens, self.params.n_ctx, ctypes.byref(_n_token_count_out) - ) != 0): + ) != 1): print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) return + _n_token_count_out = _n_token_count_out.value self.session_tokens = _session_tokens[:_n_token_count_out] print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) else: @@ -161,19 +162,21 @@ specified) expect poor results""", file=sys.stderr) raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") # debug message about similarity of saved session, if applicable - n_matching_session_tokens = 0 + self.n_matching_session_tokens = 0 if len(self.session_tokens) > 0: for id in self.session_tokens: - if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: break - n_matching_session_tokens += 1 + self.n_matching_session_tokens += 1 - if n_matching_session_tokens >= len(self.embd_inp): + if self.n_matching_session_tokens >= len(self.embd_inp): print(f"session file has exact match for prompt!") - elif n_matching_session_tokens < (len(self.embd_inp) / 2): - print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") else: - print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + + self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): @@ -258,9 +261,6 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(CONSOLE_COLOR_PROMPT) - self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) - - # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() @@ -329,7 +329,7 @@ n_keep = {self.params.n_keep} ) != 0): raise Exception("Failed to llama_eval!") - if len(self.embd) > 0 and not len(self.params.path_session) > 0: + if len(self.embd) > 0 and 
len(self.params.path_session) > 0: self.session_tokens.extend(self.embd) self.n_session_consumed = len(self.session_tokens) @@ -346,7 +346,7 @@ n_keep = {self.params.n_keep} llama_cpp.llama_save_session_file( self.ctx, self.params.path_session.encode("utf8"), - self.session_tokens, + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), len(self.session_tokens) ) From b5531e14350531943953846301c94c96f6ab2aca Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Fri, 26 May 2023 06:35:15 -0700 Subject: [PATCH 57/77] low_level_api_chat_cpp.py: Fix missing antiprompt output in chat. --- examples/low_level_api_chat_cpp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 205a5b76c..e67cd8e43 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -409,12 +409,15 @@ n_keep = {self.params.n_keep} # replace end of text token with newline token when in interactive mode if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] + self.embd.append(id) if (self.use_antiprompt()): # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] - - # add it to the context - self.embd.append(id) + for id in self.first_antiprompt[0]: + self.embd.append(id) + else: + # add it to the context + self.embd.append(id) # echo this to console self.output_echo = True From a439fe15295657bf6cdc4d06a7d6cce92c8c6902 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 12 May 2023 14:28:22 -0400 Subject: [PATCH 58/77] Allow model to tokenize strings longer than context length and set add_bos. Closes #92 --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index a56243dc9..f2366effe 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -337,7 +337,7 @@ def llama_tokenize( tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) From 731c71255b86000d956baf9ddd75992296796288 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:22:27 -0400 Subject: [PATCH 59/77] Add types for all low-level api functions --- examples/llama_cpp.py | 81 ++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index f2366effe..fce7fce1c 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -17,7 +17,7 @@ import pathlib # Load the library -def _load_shared_library(lib_base_name): +def _load_shared_library(lib_base_name: str): # Determine the file extension based on the platform if sys.platform.startswith("linux"): lib_ext = ".so" @@ -252,7 +252,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -262,7 +264,9 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read -def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: +def llama_set_state_data( + ctx: llama_context_p, src # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_set_state_data(ctx, src) @@ -274,9 +278,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out, + n_token_count_out, # type: Array[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -294,7 +298,10 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( - ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t + ctx: llama_context_p, + path_session: bytes, + tokens, # type: Array[llama_token] + n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -433,8 +440,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -456,8 +463,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -484,7 +491,10 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
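# An illustrative sketch (mirroring the low_level_api examples in this series)
# of how a candidates array is typically built from the raw logits before
# calling the samplers below:
#
#   logits = llama_get_logits(ctx)
#   n_vocab = llama_n_vocab(ctx)
#   _arr = (llama_token_data * n_vocab)(
#       *[llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab)]
#   )
#   candidates_p = ctypes.pointer(llama_token_data_array(_arr, len(_arr), False))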
-def llama_sample_softmax(ctx: llama_context_p, candidates): +def llama_sample_softmax( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -497,7 +507,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + k: c_int, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -513,7 +526,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -529,7 +545,10 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + z: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -545,7 +564,10 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -559,7 +581,11 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None -def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): +def llama_sample_temperature( + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + temp: c_float +): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -578,7 +604,12 @@ _lib.llama_sample_temperature.restype = None # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
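# Illustrative call sequence (a sketch): mu is persistent state that the
# sampler updates through the pointer argument, so it is kept in a c_float and
# passed by reference; the chat example in this series applies the temperature
# first and uses m = 100 as suggested in the paper.
#
#   mu = ctypes.c_float(2.0 * tau)
#   llama_sample_temperature(ctx, candidates_p, ctypes.c_float(temp))
#   id = llama_sample_token_mirostat(
#       ctx, candidates_p, ctypes.c_float(tau), ctypes.c_float(eta),
#       ctypes.c_int(100), ctypes.byref(mu)
#   )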
def llama_sample_token_mirostat( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + m: c_int, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -600,7 +631,11 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -616,7 +651,10 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. -def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token_greedy( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -628,7 +666,10 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. -def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From f20b34a3beb550761e11c2f0dee55ed755670a8c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:22:55 -0400 Subject: [PATCH 60/77] Add return type annotations for embeddings and logits --- examples/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index fce7fce1c..e6638ed17 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -381,7 +381,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -391,7 +391,7 @@ _lib.llama_get_logits.restype = POINTER(c_float) # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p): # type: (...) 
-> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) From 7862b520ec021f3fe76507e6857ebb1f677de6b7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 13:54:22 -0400 Subject: [PATCH 61/77] Fix llama_cpp types --- examples/llama_cpp.py | 60 +++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index e6638ed17..6b3994f13 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -8,6 +8,7 @@ from ctypes import ( c_void_p, c_bool, POINTER, + _Pointer, # type: ignore Structure, Array, c_uint8, @@ -252,9 +253,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. # Returns the number of bytes copied -def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -278,9 +277,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, # type: Array[llama_token] + tokens_out: Array[llama_token], n_token_capacity: c_size_t, - n_token_count_out, # type: Array[c_size_t] + n_token_count_out: _Pointer[c_size_t], ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -300,7 +299,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -321,7 +320,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -440,8 +439,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, ): @@ -463,8 +462,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -491,10 +490,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
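# Sketch (mirroring the chat example in this series): per-token logit biases,
# when used, are added directly to the logits buffer before the candidates
# array is built; logit_bias below is an illustrative {token_id: float} map.
#
#   logits = llama_get_logits(ctx)
#   for token_id, bias in logit_bias.items():
#       logits[token_id] += bias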
-def llama_sample_softmax( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] -): +def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): return _lib.llama_sample_softmax(ctx, candidates) @@ -508,9 +504,9 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], k: c_int, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -527,9 +523,9 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], p: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -546,9 +542,9 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], z: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -565,9 +561,9 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], p: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -582,9 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -605,11 +599,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], tau: c_float, eta: c_float, m: c_int, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -632,10 +626,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], tau: c_float, eta: c_float, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -652,8 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -667,8 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From ff31330d7f6e2b6e6279ca9d00838f1723adff15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:30 -0400 Subject: [PATCH 62/77] Fix candidates type --- examples/llama_cpp.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 6b3994f13..66bb82cf5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -439,7 +439,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, @@ -462,7 +462,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, @@ -504,7 +504,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -523,7 +523,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -542,7 +542,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
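# For reference, the low_level_api_chat example in this series chains the
# filters below in this order on its temperature path (one workable
# arrangement, not a requirement):
#
#   llama_sample_top_k(ctx, candidates_p, top_k)
#   llama_sample_tail_free(ctx, candidates_p, ctypes.c_float(tfs_z))
#   llama_sample_typical(ctx, candidates_p, ctypes.c_float(typical_p))
#   llama_sample_top_p(ctx, candidates_p, ctypes.c_float(top_p))
#   llama_sample_temperature(ctx, candidates_p, ctypes.c_float(temp))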
def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -561,7 +561,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -578,7 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -599,7 +599,7 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, m: c_int, @@ -626,7 +626,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, mu: _Pointer[c_float], @@ -646,7 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -660,7 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. 
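# In the chat example in this series the final draw depends on temperature:
# llama_sample_token_greedy(ctx, candidates_p) when temp <= 0, otherwise
# llama_sample_token(ctx, candidates_p) after the filters above have been applied.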
def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 0c2fb05361df1327c07f66f10c733a601a30f601 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:04:12 -0400 Subject: [PATCH 63/77] Fix: types --- examples/llama_cpp.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 66bb82cf5..30e8f47be 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -141,6 +141,11 @@ LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + # Functions @@ -257,7 +262,7 @@ def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_ return _lib.llama_copy_state_data(ctx, dest) -_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_copy_state_data.restype = c_size_t @@ -269,7 +274,7 @@ def llama_set_state_data( return _lib.llama_set_state_data(ctx, src) -_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_set_state_data.restype = c_size_t @@ -291,7 +296,7 @@ _lib.llama_load_session_file.argtypes = [ c_char_p, llama_token_p, c_size_t, - POINTER(c_size_t), + c_size_t_p, ] _lib.llama_load_session_file.restype = c_size_t @@ -340,7 +345,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_max_tokens: c_int, add_bos: c_bool, ) -> int: @@ -385,7 +390,7 @@ def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # typ _lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = POINTER(c_float) +_lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input @@ -395,7 +400,7 @@ def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # _lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = POINTER(c_float) +_lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. Uses the vocabulary in the provided context @@ -614,7 +619,7 @@ _lib.llama_sample_token_mirostat.argtypes = [ c_float, c_float, c_int, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat.restype = llama_token @@ -639,7 +644,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ llama_token_data_array_p, c_float, c_float, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat_v2.restype = llama_token From 4885e55ccdfdaa21e115c16fe42e0dd8e5e16339 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:12:26 -0400 Subject: [PATCH 64/77] Fix: runtime type errors --- examples/llama_cpp.py | 52 +++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 30e8f47be..62069a471 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -258,7 +258,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. 
# Destination needs to have allocated enough memory. # Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -282,9 +284,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out: Array[llama_token], + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out: _Pointer[c_size_t], + n_token_count_out, # type: _Pointer[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -304,7 +306,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -325,7 +327,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -345,7 +347,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, ) -> int: @@ -444,8 +446,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -467,8 +469,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -495,7 +497,9 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): +def llama_sample_softmax( + ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -509,7 +513,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -528,7 +532,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -547,7 +551,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -566,7 +570,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -583,7 +587,9 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + temp: c_float, ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -604,11 +610,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, m: c_int, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -631,10 +637,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -651,7 +657,8 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. 
def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -665,7 +672,8 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 690588410ef6f227ba069efe0f81e3b7baeedfe9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:30:14 -0400 Subject: [PATCH 65/77] Fix return type --- examples/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 62069a471..72bc443e5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -260,7 +260,7 @@ _lib.llama_get_state_size.restype = c_size_t # Returns the number of bytes copied def llama_copy_state_data( ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_copy_state_data(ctx, dest) @@ -272,7 +272,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Returns the number of bytes read def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_set_state_data(ctx, src) From 3808a73751f11bc92757bba77237f12e8d04b599 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 22:08:11 -0400 Subject: [PATCH 66/77] Fix obscure Wndows DLL issue. Closes #208 --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 72bc443e5..a0261b742 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -52,7 +52,7 @@ def _load_shared_library(lib_base_name: str): for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), winmode=0) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") From 59f80d2a0db7bfa77f97d711f2b47d2706c8681d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:04:22 -0400 Subject: [PATCH 67/77] Fix mlock_supported and mmap_supported return type --- examples/llama_cpp.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index a0261b742..2eb519380 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -157,7 +157,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params -def llama_mmap_supported() -> c_bool: +def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -165,7 +165,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool -def llama_mlock_supported() -> c_bool: +def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -387,7 +387,9 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_logits( + ctx: llama_context_p, +): # type: (...) 
-> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +399,9 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_embeddings( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) From 7609c73ee6d939006bdd2ddc103975a5344e7216 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 00:12:47 -0400 Subject: [PATCH 68/77] Update llama.cpp (remove min_keep default value) --- examples/llama_cpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 2eb519380..0ea37b6ee 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -519,7 +519,7 @@ def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] k: c_int, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -538,7 +538,7 @@ def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -557,7 +557,7 @@ def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] z: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -576,7 +576,7 @@ def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) From a83d11750762b9d0b9456400ae04daae7966b270 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 15 May 2023 09:15:01 -0400 Subject: [PATCH 69/77] Add winmode arg only on windows if python version supports it --- examples/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 0ea37b6ee..3d86a6150 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -44,15 +44,17 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path), winmode=0) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") From aae6c03e94d51ccde93d8412c71023ee2462b284 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 00:04:22 -0400 Subject: [PATCH 70/77] Update llama.cpp --- examples/llama_cpp.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3d86a6150..81435deeb 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -70,7 +70,7 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = c_int(1) 
+LLAMA_FILE_VERSION = c_int(2) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" @@ -111,6 +111,7 @@ class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context ("n_parts", c_int), # -1 for default + ("n_gpu_layers", c_int), # number of layers to store in VRAM ("seed", c_int), # RNG seed, 0 for random ("f16_kv", c_bool), # use fp16 for KV cache ( @@ -137,7 +138,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors # LLAMA_FTYPE_MOSTLY_Q4_3 = c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors @@ -261,9 +262,9 @@ _lib.llama_get_state_size.restype = c_size_t # Destination needs to have allocated enough memory. # Returns the number of bytes copied def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] + ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: - return _lib.llama_copy_state_data(ctx, dest) + return _lib.llama_copy_state_data(ctx, dst) _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] From 66c27f31204039ade897dead8d23c24d5bda2bc9 Mon Sep 17 00:00:00 2001 From: Aneesh Joy Date: Wed, 17 May 2023 18:04:58 +0100 Subject: [PATCH 71/77] Fixd CUBLAS dll load issue in Windows --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 81435deeb..3ce1820e2 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -48,6 +48,7 @@ def _load_shared_library(lib_base_name: str): # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors From 601b19203f37d67c767f2c9126dffdcdead369ea Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 15:26:38 -0400 Subject: [PATCH 72/77] Check for CUDA_PATH before adding --- examples/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3ce1820e2..a8f90f861 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -48,7 +48,9 @@ def _load_shared_library(lib_base_name: str): # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors From fda33ddbd510485a889cc4b3d43a51fa4438cbca Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 11:59:33 -0400 Subject: [PATCH 73/77] Fix llama_cpp and Llama type signatures. 
Closes #221 --- examples/llama_cpp.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index a8f90f861..6bddadff3 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -206,7 +206,7 @@ _lib.llama_free.restype = None # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int -) -> c_int: +) -> int: return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) @@ -225,7 +225,7 @@ def llama_apply_lora_from_file( path_lora: c_char_p, path_base_model: c_char_p, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) @@ -234,7 +234,7 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -253,7 +253,7 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: +def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -293,7 +293,7 @@ def llama_load_session_file( tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, n_token_count_out, # type: _Pointer[c_size_t] -) -> c_size_t: +) -> int: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out ) @@ -314,7 +314,7 @@ def llama_save_session_file( path_session: bytes, tokens, # type: Array[llama_token] n_token_count: c_size_t, -) -> c_size_t: +) -> int: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -337,7 +337,7 @@ def llama_eval( n_tokens: c_int, n_past: c_int, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) @@ -364,7 +364,7 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int -def llama_n_vocab(ctx: llama_context_p) -> c_int: +def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -372,7 +372,7 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int -def llama_n_ctx(ctx: llama_context_p) -> c_int: +def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -380,7 +380,7 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int -def llama_n_embd(ctx: llama_context_p) -> c_int: +def llama_n_embd(ctx: llama_context_p) -> int: return _lib.llama_n_embd(ctx) @@ -426,7 +426,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -def llama_token_bos() -> llama_token: +def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -434,7 +434,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -def llama_token_eos() -> llama_token: +def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -442,7 +442,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -def llama_token_nl() -> llama_token: +def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -625,7 +625,7 @@ def llama_sample_token_mirostat( 
eta: c_float, m: c_int, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -651,7 +651,7 @@ def llama_sample_token_mirostat_v2( tau: c_float, eta: c_float, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -669,7 +669,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -684,7 +684,7 @@ _lib.llama_sample_token_greedy.restype = llama_token def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token(ctx, candidates) From 60a7c76339c8f2866bba17a07a4014cd98be60ce Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 17:47:21 -0400 Subject: [PATCH 74/77] Update llama.cpp --- examples/llama_cpp.py | 219 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 178 insertions(+), 41 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 6bddadff3..7c27e3948 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -72,31 +72,61 @@ _lib_base_name = "llama" # Load the library _lib = _load_shared_library(_lib_base_name) -# C types -LLAMA_FILE_VERSION = c_int(2) -LLAMA_FILE_MAGIC = b"ggjt" -LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" -LLAMA_SESSION_MAGIC = b"ggsn" +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# llama.h bindings + +# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define LLAMA_FILE_VERSION 3 +LLAMA_FILE_VERSION = c_int(3) +LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT +LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML +LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_VERSION = c_int(1) +# struct llama_context; llama_context_p = c_void_p +# typedef int llama_token; llama_token = c_int llama_token_p = POINTER(llama_token) +# typedef struct llama_token_data { +# llama_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } llama_token_data; class llama_token_data(Structure): _fields_ = [ - ("id", llama_token), # token id - ("logit", c_float), # log-odds of the token - ("p", c_float), # probability of the token + ("id", llama_token), + ("logit", c_float), + ("p", c_float), ] llama_token_data_p = POINTER(llama_token_data) +# typedef struct llama_token_data_array { +# llama_token_data * data; +# size_t size; +# bool sorted; +# } llama_token_data_array; class llama_token_data_array(Structure): _fields_ = [ ("data", llama_token_data_p), @@ -107,54 +137,72 @@ class llama_token_data_array(Structure): llama_token_data_array_p = POINTER(llama_token_data_array) +# typedef void (*llama_progress_callback)(float progress, void *ctx); llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +# 
struct llama_context_params { +# int n_ctx; // text context +# int n_gpu_layers; // number of layers to store in VRAM +# int seed; // RNG seed, -1 for random + +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the llama_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only + + +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# }; class llama_context_params(Structure): _fields_ = [ - ("n_ctx", c_int), # text context - ("n_parts", c_int), # -1 for default - ("n_gpu_layers", c_int), # number of layers to store in VRAM - ("seed", c_int), # RNG seed, 0 for random - ("f16_kv", c_bool), # use fp16 for KV cache + ("n_ctx", c_int), + ("n_gpu_layers", c_int), + ("seed", c_int), + ("f16_kv", c_bool), ( "logits_all", c_bool, - ), # the llama_eval() call computes all logits, not just the last one - ("vocab_only", c_bool), # only load the vocabulary, no weights - ("use_mmap", c_bool), # use mmap if possible - ("use_mlock", c_bool), # force system to keep model in RAM - ("embedding", c_bool), # embedding mode only - # called with a progress value between 0 and 1, pass NULL to disable + ), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), ("progress_callback", llama_progress_callback), - # context pointer passed to the progress callback ("progress_callback_user_data", c_void_p), ] llama_context_params_p = POINTER(llama_context_params) +# enum llama_ftype { +# LLAMA_FTYPE_ALL_F32 = 0, +# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# }; LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( - 4 -) # tok_embeddings.weight and output.weight are F16 -# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_3 = c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# Functions +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) +# LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -163,6 +211,7 @@ 
_lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +# LLAMA_API bool llama_mmap_supported(); def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -171,6 +220,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool +# LLAMA_API bool llama_mlock_supported(); def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -179,9 +229,33 @@ _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool -# Various functions for loading a ggml llama model. -# Allocate (almost) all memory needed for the model. -# Return NULL on failure +# // TODO: not great API - very likely to change +# // Initialize the llama + ggml backend +# // Call once at the start of the program +# LLAMA_API void llama_init_backend(); +def llama_init_backend(): + return _lib.llama_init_backend() + + +_lib.llama_init_backend.argtypes = [] +_lib.llama_init_backend.restype = None + + +# LLAMA_API int64_t llama_time_us(); +def llama_time_us() -> int: + return _lib.llama_time_us() + + +_lib.llama_time_us.argtypes = [] +_lib.llama_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml llama model. +# // Allocate (almost) all memory needed for the model. +# // Return NULL on failure +# LLAMA_API struct llama_context * llama_init_from_file( +# const char * path_model, +# struct llama_context_params params); def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: @@ -193,6 +267,7 @@ _lib.llama_init_from_file.restype = llama_context_p # Frees all allocated memory +# LLAMA_API void llama_free(struct llama_context * ctx); def llama_free(ctx: llama_context_p): return _lib.llama_free(ctx) @@ -204,6 +279,11 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given +# LLAMA_API int llama_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# enum llama_ftype ftype, +# int nthread); def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int ) -> int: @@ -220,6 +300,11 @@ _lib.llama_model_quantize.restype = c_int # The model needs to be reloaded before applying a new adapter, otherwise the adapter # will be applied on top of the previous one # Returns 0 on success +# LLAMA_API int llama_apply_lora_from_file( +# struct llama_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); def llama_apply_lora_from_file( ctx: llama_context_p, path_lora: c_char_p, @@ -234,6 +319,7 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache +# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -243,6 +329,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the current rng seed. 
+# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) @@ -253,6 +340,7 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens +# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -264,6 +352,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. # Returns the number of bytes copied +# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); def llama_copy_state_data( ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: @@ -276,6 +365,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read +# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] ) -> int: @@ -287,6 +377,7 @@ _lib.llama_set_state_data.restype = c_size_t # Save/load session file +# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); def llama_load_session_file( ctx: llama_context_p, path_session: bytes, @@ -309,6 +400,7 @@ _lib.llama_load_session_file.argtypes = [ _lib.llama_load_session_file.restype = c_size_t +# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); def llama_save_session_file( ctx: llama_context_p, path_session: bytes, @@ -331,6 +423,12 @@ _lib.llama_save_session_file.restype = c_size_t # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls # Returns 0 on success +# LLAMA_API int llama_eval( +# struct llama_context * ctx, +# const llama_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); def llama_eval( ctx: llama_context_p, tokens, # type: Array[llama_token] @@ -350,6 +448,12 @@ _lib.llama_eval.restype = c_int # Returns the number of tokens on success, no more than n_max_tokens # Returns a negative number on failure - the number of tokens that would have been returned # TODO: not sure if correct +# LLAMA_API int llama_tokenize( +# struct llama_context * ctx, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); def llama_tokenize( ctx: llama_context_p, text: bytes, @@ -364,6 +468,7 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -372,6 +477,7 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int +# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -380,6 +486,7 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int +# LLAMA_API int llama_n_embd (const struct llama_context * ctx); def llama_n_embd(ctx: llama_context_p) -> int: return 
_lib.llama_n_embd(ctx) @@ -393,6 +500,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab +# LLAMA_API float * llama_get_logits(struct llama_context * ctx); def llama_get_logits( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -405,6 +513,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); def llama_get_embeddings( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -416,6 +525,7 @@ _lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -426,6 +536,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens +# LLAMA_API llama_token llama_token_bos(); def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -434,6 +545,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token +# LLAMA_API llama_token llama_token_eos(); def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -442,6 +554,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token +# LLAMA_API llama_token llama_token_nl(); def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -454,6 +567,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -477,6 +591,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -507,6 +622,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
+# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] ): @@ -521,6 +637,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -540,6 +657,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -559,6 +677,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -578,6 +697,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -596,6 +716,7 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); def llama_sample_temperature( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -618,6 +739,7 @@ _lib.llama_sample_temperature.restype = None # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); def llama_sample_token_mirostat( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -645,6 +767,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. 
A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -666,6 +789,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -681,6 +805,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -698,6 +823,7 @@ _lib.llama_sample_token.restype = llama_token # Performance information +# LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) @@ -706,6 +832,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p] _lib.llama_print_timings.restype = None +# LLAMA_API void llama_reset_timings(struct llama_context * ctx); def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) @@ -715,9 +842,19 @@ _lib.llama_reset_timings.restype = None # Print system information +# LLAMA_API const char * llama_print_system_info(void); def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() _lib.llama_print_system_info.argtypes = [] _lib.llama_print_system_info.restype = c_char_p + +################################################################################################### + + +_llama_initialized = False + +if not _llama_initialized: + llama_init_backend() + _llama_initialized = True From 4ad62c489d76b633480d8bfd1d1d2e974db67f1b Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Mon, 22 May 2023 23:54:57 -0700 Subject: [PATCH 75/77] fix "missing 1 required positional argument: 'min_keep'" --- examples/low_level_api_chat_cpp.py | 8 ++++---- examples/low_level_api_llama_cpp.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index e67cd8e43..5e8172434 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -395,10 +395,10 @@ n_keep = {self.params.n_keep} id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - 
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z),min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p),min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p),min_keep=llama_cpp.c_size_t(1)) llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 9e38ec7cb..2d1bab3f8 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -68,8 +68,8 @@ while remaining_tokens > 0: _arr, last_n_repeat, frequency_penalty, presence_penalty) - llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) - llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, min_keep=llama_cpp.c_size_t(1)) llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) id = llama_cpp.llama_sample_token(ctx, candidates_p) From e5dad2afa06f702bc926675db65ad87426ed736b Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Tue, 23 May 2023 06:21:31 -0700 Subject: [PATCH 76/77] Look for libllama in parent directory --- examples/llama_cpp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 7c27e3948..643c94bf5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -31,10 +31,12 @@ def _load_shared_library(lib_base_name: str): # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() + _base_path_parent = pathlib.Path(__file__).parent.parent.resolve() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path_parent / f"lib{lib_base_name}{lib_ext}", _base_path / f"{lib_base_name}{lib_ext}", ] From 93278f84cf06b447cc4964e5d4435bf51af174f5 Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Tue, 23 May 2023 06:21:31 -0700 Subject: [PATCH 77/77] low_level_api_chat_cpp.py: fix default path_prefix arg value to match class default value --- examples/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/common.py b/examples/common.py index b51c28b16..2a14917c5 100644 --- a/examples/common.py +++ b/examples/common.py @@ -108,7 +108,7 @@ def gpt_params_parse(argv = None): parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") - parser.add_argument("--session", type=str, default=None, help="file to cache 
model state in (may be large!)",dest="path_session") + parser.add_argument("--session", type=str, default="", help="file to cache model state in (may be large!)",dest="path_session") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument(