Merge 93278f84cf into e6e6583199

2025-02-09 02:19:48 +05:30 · 2025-02-09 02:19:48 +05:30 · 495c32cbe4
commit 495c32cbe4
parent e6e6583199 93278f84cf
8 changed files with 1943 additions and 0 deletions
--- a/examples/Chat.py
+++ b/examples/Chat.py
@ -0,0 +1,71 @@
+#!/bin/python
+import sys, os, datetime
+from common import GptParams
+from low_level_api_chat_cpp import LLaMAInteract
+
+def env_or_def(env, default):
+    """Return the value of environment variable *env*, or *default* if it is unset."""
+    return os.environ.get(env, default)
+
+# Runtime configuration; each value can be overridden through the environment.
+AI_NAME = env_or_def("AI_NAME", "ChatLLaMa")
+USER_NAME = env_or_def("USER_NAME", "USER")
+MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
+N_PREDICTS = int(env_or_def("N_PREDICTS", "2048"))
+N_THREAD = int(env_or_def("N_THREAD", "8"))
+
+# Current year and wall-clock time, interpolated into the prompt text.
+today = datetime.datetime.today()
+DATE_YEAR = today.strftime("%Y")
+DATE_TIME = today.strftime("%H:%M")
+
+# Initial prompt: a sample transcript between USER_NAME and AI_NAME with the
+# current year/time filled in. Any command-line arguments are joined with
+# spaces and appended right after the final "{USER_NAME}:" turn.
+prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
+{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision.
+There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
+The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+
+{USER_NAME}: Hello, {AI_NAME}!
+{AI_NAME}: Hello {USER_NAME}! How may I help you today?
+{USER_NAME}: What year is it?
+{AI_NAME}: We are in {DATE_YEAR}.
+{USER_NAME}: Please tell me the largest city in Europe.
+{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia.
+{USER_NAME}: What can you tell me about Moscow?
+{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center.
+{USER_NAME}: What is a cat?
+{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
+{USER_NAME}: How do I pass command line arguments to a Node.js program?
+{AI_NAME}: The arguments are stored in process.argv.
+
+    argv[0] is the path to the Node. js executable.
+    argv[1] is the path to the script file.
+    argv[2] is the first argument passed to the script.
+    argv[3] is the second argument passed to the script and so on.
+{USER_NAME}: Name a color.
+{AI_NAME}: Blue.
+{USER_NAME}: What time is it?
+{AI_NAME}: It is {DATE_TIME}.
+{USER_NAME}:""" + " ".join(sys.argv[1:])
+
+print("Loading model...")
+params = GptParams(
+    # Model and run mode
+    model=MODEL,
+    prompt=prompt,
+    interactive=True,
+    use_color=True,
+    # Turn markers: reverse prompt plus the text wrapped around user input
+    antiprompt=[f"{USER_NAME}:"],
+    input_prefix=" ",
+    input_suffix=f"{AI_NAME}:",
+    # Context, batching and generation limits
+    n_ctx=2048,
+    n_batch=1024,
+    n_predict=N_PREDICTS,
+    n_threads=N_THREAD,
+    # Sampling
+    temp=0.7,
+    top_k=40,
+    top_p=0.5,
+    repeat_last_n=256,
+    repeat_penalty=1.17647,
+)
+
+# Run the interactive loop; the context manager is entered/exited around it.
+with LLaMAInteract(params) as m:
+    m.interact()
--- a/examples/Miku.py
+++ b/examples/Miku.py
@ -0,0 +1,59 @@
+#!/bin/python
+import sys, os
+from common import GptParams
+from low_level_api_chat_cpp import LLaMAInteract
+
+def env_or_def(env, default):
+    """Return the value of environment variable *env*, or *default* if it is unset."""
+    return os.environ.get(env, default)
+
+# Runtime configuration; each value can be overridden through the environment.
+AI_NAME = env_or_def("AI_NAME", "Miku")
+USER_NAME = env_or_def("USER_NAME", "Anon")
+MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
+N_PREDICTS = int(env_or_def("N_PREDICTS", "4096"))
+# Only values greater than 0 are applied to the params later in this script.
+N_THREAD = int(env_or_def("N_THREAD", "0"))
+
+# Initial prompt: character description plus a short sample exchange between
+# USER_NAME and AI_NAME. Any command-line arguments are joined with spaces and
+# appended right after the final "{USER_NAME}:" turn.
+prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer.
+{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
+{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
+{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
+{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
+The conversation is only between {USER_NAME} and {AI_NAME}
+The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice.
+{AI_NAME} can only communicate through text, so she can't send images or videos.
+
+
+{USER_NAME}: Hello!
+{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
+{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
+{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
+{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
+{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
+{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that!
+{AI_NAME}: What do you like to do in your free time? ^_^
+{USER_NAME}:""" + " ".join(sys.argv[1:])
+
+print("Loading model...")
+params = GptParams(
+    # Model and run mode
+    model=MODEL,
+    prompt=prompt,
+    interactive=True,
+    use_color=True,
+    # Reverse prompt
+    antiprompt=[f"{USER_NAME}:"],
+    # Context, batching and generation limits
+    n_ctx=2048,
+    n_batch=1024,
+    n_keep=-1,
+    n_predict=N_PREDICTS,
+    # Sampling
+    temp=0.7,
+    top_k=40,
+    top_p=0.5,
+    repeat_last_n=256,
+    repeat_penalty=1.17647,
+)
+
+# Keep the GptParams default thread count unless a positive value was requested.
+if N_THREAD > 0:
+    params.n_threads = N_THREAD
+
+# Run the interactive loop; the context manager is entered/exited around it.
+with LLaMAInteract(params) as m:
+    m.interact()
--- a/examples/ReasonAct.py
+++ b/examples/ReasonAct.py
@ -0,0 +1,49 @@
+#!/bin/python
+import sys, os, datetime
+from common import GptParams
+from low_level_api_chat_cpp import LLaMAInteract
+
+def env_or_def(env, default):
+    """Return the value of environment variable *env*, or *default* if it is unset."""
+    return os.environ.get(env, default)
+
+# Model path; can be overridden with the MODEL environment variable.
+MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
+
+# Initial prompt: instructions for a Thought/Action/Observation loop followed by
+# two worked examples. Any command-line arguments are joined with spaces and
+# appended right after the final "Question:" marker.
+# NOTE(review): the string is an f-string but contains no placeholders.
+prompt=f"""You run in a loop of Thought, Action, Observation.
+At the end of the loop either Answer or restate your Thought and Action.
+Use Thought to describe your thoughts about the question you have been asked.
+Use Action to run one of these actions available to you:
+- calculate[python math expression]
+Observation will be the result of running those actions
+
+
+Question: What is 4 * 7 / 3?
+Thought: Do I need to use an action? Yes, I use calculate to do math
+Action: calculate[4 * 7 / 3]
+Observation: 9.3333333333
+Thought: Do I need to use an action? No, have the result
+Answer: The calculate tool says it is 9.3333333333
+Question: What is capital of france?
+Thought: Do I need to use an action? No, I know the answer
+Answer: Paris is the capital of France
+Question:""" + " ".join(sys.argv[1:])
+
+print("Loading model...")
+params = GptParams(
+    # Model and run mode
+    model=MODEL,
+    prompt=prompt,
+    interactive=True,
+    interactive_start=True,
+    # Reverse prompts and the prefix put in front of user input
+    antiprompt=["Question:","Observation:"],
+    input_prefix=" ",
+    # Context, generation limit and threads
+    n_ctx=2048,
+    n_predict=-1,
+    n_threads=7,
+    # Sampling
+    temp=0.2,
+    top_k=10000,
+    repeat_penalty=1,
+)
+
+# Run the interactive loop; the context manager is entered/exited around it.
+with LLaMAInteract(params) as m:
+    m.interact()
--- a/examples/common.py
+++ b/examples/common.py
@ -0,0 +1,202 @@
+import os
+import argparse
+import re
+
+from dataclasses import dataclass, field
+from typing import List
+
+# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
+
+
+@dataclass
+class GptParams:
+    """Parameter bundle for the example scripts.
+
+    The field set follows llama.cpp's ``examples/common.cpp`` (see the comment
+    above this class), plus a few custom fields at the end. Every field has a
+    default, so ``GptParams()`` is valid and callers override what they need by
+    keyword.
+    """
+
+    # RNG seed; the CLI help describes values <= 0 as "use random seed".
+    seed: int = -1
+    # Number of threads to use during computation.
+    n_threads: int = min(4, os.cpu_count() or 1)
+    # Number of tokens to predict (-1 = infinity per the CLI help).
+    n_predict: int = 128
+    n_parts: int = -1
+    # Size of the prompt context.
+    n_ctx: int = 512
+    # Batch size for prompt processing.
+    n_batch: int = 8
+    # Number of tokens to keep from the initial prompt.
+    n_keep: int = 0
+
+    # Sampling parameters.
+    ignore_eos: bool = False
+    # Maps token id -> bias; filled from --logit-bias by gpt_params_parse().
+    logit_bias: dict[int, float] = field(default_factory=dict)
+    top_k: int = 40
+    top_p: float = 0.95
+    tfs_z: float = 1.00
+    typical_p: float = 1.00
+    temp: float = 0.80
+    repeat_penalty: float = 1.10
+    repeat_last_n: int = 64
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    mirostat: int = 0
+    mirostat_tau: float = 5.0
+    mirostat_eta: float = 0.1
+
+    # Model, prompt and interaction strings.
+    model: str = "./models/llama-7B/ggml-model.bin"
+    prompt: str = ""
+    path_session: str = ""
+    input_prefix: str = " "
+    input_suffix: str = ""
+    # Reverse prompts: user input is polled when one of these is seen.
+    antiprompt: List[str] = field(default_factory=list)
+
+    lora_adapter: str = ""
+    lora_base: str = ""
+
+    memory_f16: bool = True
+    random_prompt: bool = False
+    use_color: bool = False
+    interactive: bool = False
+
+    embedding: bool = False
+    interactive_start: bool = False
+
+    instruct: bool = False
+    penalize_nl: bool = True
+    perplexity: bool = False
+    use_mmap: bool = True
+    use_mlock: bool = False
+    mem_test: bool = False
+    verbose_prompt: bool = False
+
+    # Path of a file containing the initial prompt; None when not given.
+    file: str = None
+
+    # If chat ended prematurely, append this to the conversation to fix it.
+    # Set to "\nUser:" etc.
+    # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"
+    fix_prefix: str = ""
+    # Fixed: a trailing comma here used to make the default the tuple (True,)
+    # instead of the boolean True.
+    input_echo: bool = True
+
+    # Default instructions for Alpaca
+    # switch to "Human" and "Assistant" for Vicuna.
+    # TODO: TBD how they are gonna handle this upstream
+    instruct_inp_prefix: str="\n\n### Instruction:\n\n"
+    instruct_inp_suffix: str="\n\n### Response:\n\n"
+
+
+def gpt_params_parse(argv = None):
+    """Parse llama.cpp-style command-line options into a ``GptParams``.
+
+    Args:
+        argv: Argument list to parse. ``None`` makes argparse read
+            ``sys.argv[1:]``.
+
+    Returns:
+        A ``GptParams`` built from the parsed options. ``use_mmap`` is forced
+        to ``False`` when a LoRA adapter is given, ``antiprompt`` is always a
+        list, and ``logit_bias`` is filled from every well-formed
+        ``--logit-bias TOKEN_ID(+/-)BIAS`` option.
+
+    Note:
+        argparse exits the process (``SystemExit``) on invalid arguments.
+    """
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
+    parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
+    parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
+    parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
+    parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
+    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
+    parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep")
+
+    parser.add_argument(
+        "-l",
+        "--logit-bias",
+        type=str,
+        action='append',
+        help="--logit-bias TOKEN_ID(+/-)BIAS",
+        dest="logit_bias_str"
+    )
+    parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
+    parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
+    parser.add_argument("--top_p", type=float, default=0.95, help="top-p sampling",dest="top_p")
+    parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z")
+    parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
+    parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
+    parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n")
+    # Fixed: this option used to write to "tfs_z", which both clobbered the
+    # tail-free-sampling value and left frequency_penalty unset.
+    parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="frequency_penalty")
+    parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty")
+    # Fixed: type/default now agree with GptParams.mirostat (int, 0); the old
+    # float default of 1.0 silently differed from GptParams().
+    parser.add_argument("--mirostat", type=int, default=0, help="use Mirostat sampling.",dest="mirostat")
+    parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau")
+    parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
+
+    parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
+    parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
+    parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
+    parser.add_argument("--session", type=str, default="", help="file to cache model state in (may be large!)",dest="path_session")
+    # NOTE(review): this default ("") differs from GptParams.input_prefix (" ");
+    # left unchanged because it is unclear which one is intended.
+    parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
+    parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix")
+    parser.add_argument(
+        "-r",
+        "--reverse-prompt",
+        type=str,
+        action='append',
+        help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).",
+        dest="antiprompt"
+    )
+
+    parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter")
+    parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base")
+
+    parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
+    parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt")
+    parser.add_argument(
+        "--color",
+        action="store_true",
+        help="colorise output to distinguish prompt and user input from generations",
+        dest="use_color"
+    )
+    parser.add_argument(
+        "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive"
+    )
+
+    parser.add_argument("--embedding", action="store_true", help="", dest="embedding")
+    parser.add_argument(
+        "--interactive-first",
+        action="store_true",
+        help="run in interactive mode and wait for input right away",
+        dest="interactive_start"
+    )
+
+    parser.add_argument(
+        "-ins",
+        "--instruct",
+        action="store_true",
+        help="run in instruction mode (use with Alpaca or Vicuna models)",
+        dest="instruct"
+    )
+    parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl")
+    parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
+    parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap")
+    parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock")
+    parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test")
+    parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt")
+
+    #Custom args
+    parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix")
+    parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo")
+
+    # Alias: sets the same destination as -i/--interactive.
+    parser.add_argument(
+        "--interactive-start",
+        action="store_true",
+        help="run in interactive mode",
+        dest="interactive"
+    )
+
+    args = parser.parse_args(argv)
+
+    options = vars(args)
+    # logit_bias_str is not a GptParams field; it is converted separately below.
+    logit_bias_str = options.pop("logit_bias_str")
+    # action="append" leaves the value as None when the flag never appears,
+    # while GptParams.antiprompt is declared as a list.
+    if options["antiprompt"] is None:
+        options["antiprompt"] = []
+    params = GptParams(**options)
+
+    # A LoRA adapter implies --no-mmap (see the --lora help text).
+    if params.lora_adapter:
+        params.use_mmap = False
+
+    if logit_bias_str is not None:
+        for entry in logit_bias_str:
+            # TOKEN_ID followed by a signed bias. Decimal biases are accepted;
+            # the previous pattern only captured the integer part. Entries that
+            # do not match are ignored, as before.
+            if (m := re.match(r"(\d+)([-+](?:\d+\.?\d*|\.\d+))", entry)):
+                params.logit_bias[int(m.group(1))] = float(m.group(2))
+
+    return params
+
+def gpt_random_prompt(rng):
+    """Return one of ten fixed prompt openers, selected by ``rng`` modulo 10."""
+    openers = (
+        "So",
+        "Once upon a time",
+        "When",
+        "The",
+        "After",
+        "If",
+        "import",
+        "He",
+        "She",
+        "They",
+    )
+    index = rng % 10
+    return openers[index]
+
+# When run as a script, parse the process arguments and print the resulting params.
+if __name__ == "__main__":
+    print(gpt_params_parse())
--- a/examples/llama_cpp.py
+++ b/examples/llama_cpp.py
@ -0,0 +1,862 @@
+import sys
+import os
+import ctypes
+from ctypes import (
+    c_int,
+    c_float,
+    c_char_p,
+    c_void_p,
+    c_bool,
+    POINTER,
+    _Pointer,  # type: ignore
+    Structure,
+    Array,
+    c_uint8,
+    c_size_t,
+)
+import pathlib
+
+
+# Load the library
+def _load_shared_library(lib_base_name: str):
+    """Locate the llama shared library and load it with ``ctypes.CDLL``.
+
+    Candidate paths are ``lib<name><ext>`` next to this file,
+    ``lib<name><ext>`` in the parent directory, and ``<name><ext>`` next to this
+    file, where ``<ext>`` depends on the platform. If the ``LLAMA_CPP_LIB``
+    environment variable is set, its value is used as the only candidate.
+
+    Args:
+        lib_base_name: Library name without the ``lib`` prefix or extension.
+
+    Returns:
+        The loaded ``ctypes.CDLL`` handle for the first candidate that exists.
+
+    Raises:
+        RuntimeError: On an unsupported platform, or when an existing candidate
+            fails to load (the original exception is attached as ``__cause__``).
+        FileNotFoundError: When no candidate path exists.
+    """
+    # Determine the file extension based on the platform
+    if sys.platform.startswith("linux"):
+        lib_ext = ".so"
+    elif sys.platform == "darwin":
+        lib_ext = ".dylib"
+    elif sys.platform == "win32":
+        lib_ext = ".dll"
+    else:
+        raise RuntimeError("Unsupported platform")
+
+    # Construct the paths to the possible shared library names
+    _base_path = pathlib.Path(__file__).parent.resolve()
+    _base_path_parent = pathlib.Path(__file__).parent.parent.resolve()
+    # Searching for the library in the current directory under the name "libllama" (default name
+    # for llamacpp) and "llama" (default name for this repo)
+    _lib_paths = [
+        _base_path / f"lib{lib_base_name}{lib_ext}",
+        _base_path_parent / f"lib{lib_base_name}{lib_ext}",
+        _base_path / f"{lib_base_name}{lib_ext}",
+    ]
+
+    # An explicit path from the environment replaces the default search list.
+    if "LLAMA_CPP_LIB" in os.environ:
+        lib_base_name = os.environ["LLAMA_CPP_LIB"]
+        _lib = pathlib.Path(lib_base_name)
+        _base_path = _lib.parent.resolve()
+        _lib_paths = [_lib.resolve()]
+
+    cdll_args = dict()  # type: ignore
+    # Add the library directory to the DLL search path on Windows (if needed)
+    if sys.platform == "win32" and sys.version_info >= (3, 8):
+        os.add_dll_directory(str(_base_path))
+        if "CUDA_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
+        cdll_args["winmode"] = 0
+
+    # Try to load the shared library, handling potential errors
+    for _lib_path in _lib_paths:
+        if _lib_path.exists():
+            try:
+                return ctypes.CDLL(str(_lib_path), **cdll_args)
+            except Exception as e:
+                # Chain explicitly so the loader's own error stays attached.
+                raise RuntimeError(
+                    f"Failed to load shared library '{_lib_path}': {e}"
+                ) from e
+
+    raise FileNotFoundError(
+        f"Shared library with base name '{lib_base_name}' not found"
+    )
+
+
+# Specify the base name of the shared library to load
+_lib_base_name = "llama"
+
+# Load the library
+# NOTE: this runs at import time, so importing this module raises if the
+# shared library cannot be found or loaded.
+_lib = _load_shared_library(_lib_base_name)
+
+# Misc
+# Pointer type aliases reused by the argtypes/restype declarations in this module.
+c_float_p = POINTER(c_float)
+c_uint8_p = POINTER(c_uint8)
+c_size_t_p = POINTER(c_size_t)
+
+# llama.h bindings
+
+# The constants below are ctypes instances, not plain ints; use ``.value`` to
+# read the underlying Python integer.
+
+# #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74)
+# #define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61)
+# #define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66)
+# #define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C)
+# #define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E)
+
+# #define LLAMA_FILE_VERSION           3
+LLAMA_FILE_VERSION = c_int(3)
+# Aliases: the current file magic is GGJT, the unversioned one is GGML, and
+# session files use GGSN.
+LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT
+LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML
+LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
+LLAMA_SESSION_VERSION = c_int(1)
+
+# struct llama_context;
+# The context struct is only forward-declared, so its pointer is modelled as void*.
+llama_context_p = c_void_p
+
+
+# typedef int llama_token;
+llama_token = c_int
+llama_token_p = POINTER(llama_token)
+
+
+# typedef struct llama_token_data {
+#     llama_token id; // token id
+#     float logit;    // log-odds of the token
+#     float p;        // probability of the token
+# } llama_token_data;
+class llama_token_data(Structure):
+    """ctypes mirror of the C ``llama_token_data`` struct."""
+
+    # Field order and types must match the C declaration.
+    _fields_ = [
+        ("id", llama_token),  # token id
+        ("logit", c_float),  # log-odds of the token
+        ("p", c_float),  # probability of the token
+    ]
+
+
+# Pointer to llama_token_data, used by llama_token_data_array.
+llama_token_data_p = POINTER(llama_token_data)
+
+
+# typedef struct llama_token_data_array {
+#     llama_token_data * data;
+#     size_t size;
+#     bool sorted;
+# } llama_token_data_array;
+class llama_token_data_array(Structure):
+    """ctypes mirror of the C ``llama_token_data_array`` struct."""
+
+    # Field order and types must match the C declaration.
+    _fields_ = [
+        ("data", llama_token_data_p),
+        ("size", c_size_t),
+        ("sorted", c_bool),
+    ]
+
+
+# Pointer to llama_token_data_array.
+llama_token_data_array_p = POINTER(llama_token_data_array)
+
+# typedef void (*llama_progress_callback)(float progress, void *ctx);
+# CFUNCTYPE(restype, *argtypes): a None restype corresponds to the C void return.
+llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
+
+
+# struct llama_context_params {
+#     int n_ctx;        // text context
+#     int n_gpu_layers; // number of layers to store in VRAM
+#     int seed;         // RNG seed, -1 for random
+
+#     bool f16_kv;     // use fp16 for KV cache
+#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
+#     bool vocab_only; // only load the vocabulary, no weights
+#     bool use_mmap;   // use mmap if possible
+#     bool use_mlock;  // force system to keep model in RAM
+#     bool embedding;  // embedding mode only
+
+
+#     // called with a progress value between 0 and 1, pass NULL to disable
+#     llama_progress_callback progress_callback;
+#     // context pointer passed to the progress callback
+#     void * progress_callback_user_data;
+# };
+class llama_context_params(Structure):
+    """ctypes mirror of the C ``llama_context_params`` struct."""
+
+    # Field order and types must match the C declaration.
+    _fields_ = [
+        ("n_ctx", c_int),  # text context
+        ("n_gpu_layers", c_int),  # number of layers to store in VRAM
+        ("seed", c_int),  # RNG seed, -1 for random
+        ("f16_kv", c_bool),  # use fp16 for KV cache
+        (
+            "logits_all",
+            c_bool,
+        ),  # llama_eval() computes all logits, not just the last one
+        ("vocab_only", c_bool),  # only load the vocabulary, no weights
+        ("use_mmap", c_bool),  # use mmap if possible
+        ("use_mlock", c_bool),  # force system to keep model in RAM
+        ("embedding", c_bool),  # embedding mode only
+        # called with a progress value between 0 and 1, NULL to disable
+        ("progress_callback", llama_progress_callback),
+        # context pointer passed to the progress callback
+        ("progress_callback_user_data", c_void_p),
+    ]
+
+
+# Pointer to llama_context_params.
+llama_context_params_p = POINTER(llama_context_params)
+
+# enum llama_ftype {
+#     LLAMA_FTYPE_ALL_F32              = 0,
+#     LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+#     // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+#     // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+#     LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+# };
+# Python counterparts of enum llama_ftype, stored as c_int instances.
+# Values 5 and 6 are skipped because the enum lists those entries as removed.
+LLAMA_FTYPE_ALL_F32 = c_int(0)
+LLAMA_FTYPE_MOSTLY_F16 = c_int(1)
+LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2)
+LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)
+LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4)
+LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)
+LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)
+LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9)
+
+
+# LLAMA_API struct llama_context_params llama_context_default_params();
+def llama_context_default_params() -> llama_context_params:
+    """Return the ``llama_context_params`` struct produced by the native library."""
+    default_params = _lib.llama_context_default_params()
+    return default_params
+
+
+_lib.llama_context_default_params.restype = llama_context_params
+_lib.llama_context_default_params.argtypes = []
+
+
+# LLAMA_API bool llama_mmap_supported();
+def llama_mmap_supported() -> bool:
+    """Return the boolean reported by the native ``llama_mmap_supported()``."""
+    supported = _lib.llama_mmap_supported()
+    return supported
+
+
+_lib.llama_mmap_supported.restype = c_bool
+_lib.llama_mmap_supported.argtypes = []
+
+
+# LLAMA_API bool llama_mlock_supported();
+def llama_mlock_supported() -> bool:
+    """Return the boolean reported by the native ``llama_mlock_supported()``."""
+    supported = _lib.llama_mlock_supported()
+    return supported
+
+
+_lib.llama_mlock_supported.restype = c_bool
+_lib.llama_mlock_supported.argtypes = []
+
+
+# // TODO: not great API - very likely to change
+# // Initialize the llama + ggml backend
+# // Call once at the start of the program
+# LLAMA_API void llama_init_backend();
+def llama_init_backend():
+    """Call the native ``llama_init_backend()``; the C function returns void."""
+    result = _lib.llama_init_backend()
+    return result
+
+
+_lib.llama_init_backend.restype = None
+_lib.llama_init_backend.argtypes = []
+
+
+# LLAMA_API int64_t llama_time_us();
+def llama_time_us() -> int:
+    """Return the 64-bit integer reported by the native ``llama_time_us()``."""
+    timestamp = _lib.llama_time_us()
+    return timestamp
+
+
+_lib.llama_time_us.restype = ctypes.c_int64
+_lib.llama_time_us.argtypes = []
+
+
+# // Various functions for loading a ggml llama model.
+# // Allocate (almost) all memory needed for the model.
+# // Return NULL on failure
+# LLAMA_API struct llama_context * llama_init_from_file(
+#                             const char * path_model,
+#         struct llama_context_params   params);
+def llama_init_from_file(
+    path_model: bytes, params: llama_context_params
+) -> llama_context_p:
+    """Create a context by passing ``path_model`` and ``params`` to the native loader.
+
+    Per the C header the native call returns NULL on failure.
+    """
+    context = _lib.llama_init_from_file(path_model, params)
+    return context
+
+
+_lib.llama_init_from_file.restype = llama_context_p
+_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params]
+
+
+# Frees all allocated memory
+# LLAMA_API void llama_free(struct llama_context * ctx);
+def llama_free(ctx: llama_context_p):
+    """Pass ``ctx`` to the native ``llama_free``; the C function returns void."""
+    result = _lib.llama_free(ctx)
+    return result
+
+
+_lib.llama_free.restype = None
+_lib.llama_free.argtypes = [llama_context_p]
+
+
+# TODO: not great API - very likely to change
+# Returns 0 on success
+# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
+# LLAMA_API int llama_model_quantize(
+#         const char * fname_inp,
+#         const char * fname_out,
+#     enum llama_ftype   ftype,
+#         int          nthread);
+def llama_model_quantize(
+    fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int
+) -> int:
+    """Forward the arguments to the native ``llama_model_quantize``.
+
+    Per the C header the call returns 0 on success.
+    """
+    status = _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)
+    return status
+
+
+_lib.llama_model_quantize.restype = c_int
+_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
+
+
+# Apply a LoRA adapter to a loaded model
+# path_base_model is the path to a higher quality model to use as a base for
+# the layers modified by the adapter. Can be NULL to use the current loaded model.
+# The model needs to be reloaded before applying a new adapter, otherwise the adapter
+# will be applied on top of the previous one
+# Returns 0 on success
+# LLAMA_API int llama_apply_lora_from_file(
+#         struct llama_context * ctx,
+#                   const char * path_lora,
+#                   const char * path_base_model,
+#                          int   n_threads);
+def llama_apply_lora_from_file(
+    ctx: llama_context_p,
+    path_lora: c_char_p,
+    path_base_model: c_char_p,
+    n_threads: c_int,
+) -> int:
+    """Forward the arguments to the native ``llama_apply_lora_from_file``.
+
+    Per the C header the call returns 0 on success.
+    """
+    status = _lib.llama_apply_lora_from_file(
+        ctx, path_lora, path_base_model, n_threads
+    )
+    return status
+
+
+_lib.llama_apply_lora_from_file.restype = c_int
+_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int]
+
+
+# Returns the number of tokens in the KV cache
+# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
+    """Return the integer reported by the native ``llama_get_kv_cache_token_count``."""
+    token_count = _lib.llama_get_kv_cache_token_count(ctx)
+    return token_count
+
+
+_lib.llama_get_kv_cache_token_count.restype = c_int
+_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
+
+
+# Sets the current rng seed.
+# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+def llama_set_rng_seed(ctx: llama_context_p, seed: c_int):
+    """Pass ``ctx`` and ``seed`` to the native ``llama_set_rng_seed`` (returns void)."""
+    result = _lib.llama_set_rng_seed(ctx, seed)
+    return result
+
+
+_lib.llama_set_rng_seed.restype = None
+_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int]
+
+
+# Returns the maximum size in bytes of the state (rng, logits, embedding
+# and kv_cache) - will often be smaller after compacting tokens
+# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+def llama_get_state_size(ctx: llama_context_p) -> int:
+    """Return the ``size_t`` reported by the native ``llama_get_state_size``."""
+    state_size = _lib.llama_get_state_size(ctx)
+    return state_size
+
+
+_lib.llama_get_state_size.restype = c_size_t
+_lib.llama_get_state_size.argtypes = [llama_context_p]
+
+
+# Copies the state to the specified destination address.
+# Destination needs to have allocated enough memory.
+# Returns the number of bytes copied
+# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+def llama_copy_state_data(
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
+) -> int:
+    """Forward ``ctx`` and ``dst`` to the native ``llama_copy_state_data``.
+
+    Per the C header the call returns the number of bytes copied.
+    """
+    bytes_copied = _lib.llama_copy_state_data(ctx, dst)
+    return bytes_copied
+
+
+_lib.llama_copy_state_data.restype = c_size_t
+_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
+
+
+# Set the state reading from the specified address
+# Returns the number of bytes read
+# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+def llama_set_state_data(
+    ctx: llama_context_p, src  # type: Array[c_uint8]
+) -> int:
+    """Forward ``ctx`` and ``src`` to the native ``llama_set_state_data``.
+
+    Per the C header the call returns the number of bytes read.
+    """
+    bytes_read = _lib.llama_set_state_data(ctx, src)
+    return bytes_read
+
+
+_lib.llama_set_state_data.restype = c_size_t
+_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p]
+
+
+# Save/load session file
+# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+def llama_load_session_file(
+    ctx: llama_context_p,
+    path_session: bytes,
+    tokens_out,  # type: Array[llama_token]
+    n_token_capacity: c_size_t,
+    n_token_count_out,  # type: _Pointer[c_size_t]
+) -> bool:
+    """Forward the arguments to the native ``llama_load_session_file``.
+
+    Parameter names mirror the C declaration. Judging by those names,
+    ``tokens_out`` / ``n_token_count_out`` receive the loaded tokens and their
+    count, bounded by ``n_token_capacity`` - TODO confirm against llama.h.
+
+    Returns:
+        The C function's ``bool`` result as a Python ``bool`` (a subclass of
+        ``int``, so callers that compared the old result with 0/1 keep working).
+    """
+    return _lib.llama_load_session_file(
+        ctx, path_session, tokens_out, n_token_capacity, n_token_count_out
+    )
+
+
+_lib.llama_load_session_file.argtypes = [
+    llama_context_p,
+    c_char_p,
+    llama_token_p,
+    c_size_t,
+    c_size_t_p,
+]
+# The C function is declared as returning bool. Reading the result as size_t
+# can pick up undefined upper bits and make a failure look like success.
+_lib.llama_load_session_file.restype = c_bool
+
+
+# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+def llama_save_session_file(
+    ctx: llama_context_p,
+    path_session: bytes,
+    tokens,  # type: Array[llama_token]
+    n_token_count: c_size_t,
+) -> bool:
+    """Forward the arguments to the native ``llama_save_session_file``.
+
+    Parameter names mirror the C declaration.
+
+    Returns:
+        The C function's ``bool`` result as a Python ``bool`` (a subclass of
+        ``int``, so callers that compared the old result with 0/1 keep working).
+    """
+    return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)
+
+
+_lib.llama_save_session_file.argtypes = [
+    llama_context_p,
+    c_char_p,
+    llama_token_p,
+    c_size_t,
+]
+# The C function is declared as returning bool. Reading the result as size_t
+# can pick up undefined upper bits and make a failure look like success.
+_lib.llama_save_session_file.restype = c_bool
+
+
+# Run the llama inference to obtain the logits and probabilities for the next token.
+# tokens + n_tokens is the provided batch of new tokens to process
+# n_past is the number of tokens to use from previous eval calls
+# Returns 0 on success
+# LLAMA_API int llama_eval(
+#         struct llama_context * ctx,
+#            const llama_token * tokens,
+#                          int   n_tokens,
+#                          int   n_past,
+#                          int   n_threads);
+def llama_eval(
+    ctx: llama_context_p,
+    tokens,  # type: Array[llama_token]
+    n_tokens: c_int,
+    n_past: c_int,
+    n_threads: c_int,
+) -> int:
+    """Forward a batch of tokens to the native ``llama_eval``.
+
+    Per the C header the call returns 0 on success.
+    """
+    status = _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
+    return status
+
+
+_lib.llama_eval.restype = c_int
+_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int]
+
+
+# Convert the provided text into tokens.
+# The tokens pointer must be large enough to hold the resulting tokens.
+# Returns the number of tokens on success, no more than n_max_tokens
+# Returns a negative number on failure - the number of tokens that would have been returned
+# TODO: not sure if correct
+# LLAMA_API int llama_tokenize(
+#         struct llama_context * ctx,
+#                   const char * text,
+#                  llama_token * tokens,
+#                          int   n_max_tokens,
+#                         bool   add_bos);
+def llama_tokenize(
+    ctx: llama_context_p,
+    text: bytes,
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: c_int,
+    add_bos: c_bool,
+) -> int:
+    """Forward ``text`` to the C function ``llama_tokenize``.
+
+    Args:
+        ctx: Context pointer handed to the C library.
+        text: Text to tokenize, as bytes (C ``const char *``).
+        tokens: ctypes array that receives the tokens.
+        n_max_tokens: Capacity of ``tokens``.
+        add_bos: Value for the C ``add_bos`` flag.
+
+    Returns:
+        The C ``int`` result: the token count on success, negative on failure.
+    """
+    n_written = _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
+    return n_written
+
+
+_lib.llama_tokenize.restype = c_int
+_lib.llama_tokenize.argtypes = [
+    llama_context_p,
+    c_char_p,
+    llama_token_p,
+    c_int,
+    c_bool,
+]
+
+
+# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+def llama_n_vocab(ctx: llama_context_p) -> int:
+    """Return the ``int`` result of the C function ``llama_n_vocab`` for ``ctx``."""
+    n_vocab = _lib.llama_n_vocab(ctx)
+    return n_vocab
+
+
+_lib.llama_n_vocab.restype = c_int
+_lib.llama_n_vocab.argtypes = [llama_context_p]
+
+
+# LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+def llama_n_ctx(ctx: llama_context_p) -> int:
+    """Return the ``int`` result of the C function ``llama_n_ctx`` for ``ctx``."""
+    n_ctx = _lib.llama_n_ctx(ctx)
+    return n_ctx
+
+
+_lib.llama_n_ctx.restype = c_int
+_lib.llama_n_ctx.argtypes = [llama_context_p]
+
+
+# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+def llama_n_embd(ctx: llama_context_p) -> int:
+    """Return the ``int`` result of the C function ``llama_n_embd`` for ``ctx``."""
+    n_embd = _lib.llama_n_embd(ctx)
+    return n_embd
+
+
+_lib.llama_n_embd.restype = c_int
+_lib.llama_n_embd.argtypes = [llama_context_p]
+
+
+# Token logits obtained from the last call to llama_eval()
+# The logits for the last token are stored in the last row
+# Can be mutated in order to change the probabilities of the next token
+# Rows: n_tokens
+# Cols: n_vocab
+# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+def llama_get_logits(
+    ctx: llama_context_p,
+):  # type: (...) -> Array[float] # type: ignore
+    """Return the ``float *`` produced by the C function ``llama_get_logits``.
+
+    The result is a ctypes float pointer (``c_float_p``), not a Python list.
+    """
+    logits_ptr = _lib.llama_get_logits(ctx)
+    return logits_ptr
+
+
+_lib.llama_get_logits.restype = c_float_p
+_lib.llama_get_logits.argtypes = [llama_context_p]
+
+
+# Get the embeddings for the input
+# shape: [n_embd] (1-dimensional)
+# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+def llama_get_embeddings(
+    ctx: llama_context_p,
+):  # type: (...) -> Array[float] # type: ignore
+    """Return the ``float *`` produced by the C function ``llama_get_embeddings``.
+
+    The result is a ctypes float pointer (``c_float_p``), not a Python list.
+    """
+    embeddings_ptr = _lib.llama_get_embeddings(ctx)
+    return embeddings_ptr
+
+
+_lib.llama_get_embeddings.restype = c_float_p
+_lib.llama_get_embeddings.argtypes = [llama_context_p]
+
+
+# Token Id -> String. Uses the vocabulary in the provided context
+# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
+    """Return the ``const char *`` result of ``llama_token_to_str`` as bytes."""
+    text = _lib.llama_token_to_str(ctx, token)
+    return text
+
+
+_lib.llama_token_to_str.restype = c_char_p
+_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
+
+# Special tokens
+
+
+# LLAMA_API llama_token llama_token_bos();
+def llama_token_bos() -> int:
+    """Return the token id given by the C function ``llama_token_bos``."""
+    bos_token = _lib.llama_token_bos()
+    return bos_token
+
+
+_lib.llama_token_bos.restype = llama_token
+_lib.llama_token_bos.argtypes = []
+
+
+# LLAMA_API llama_token llama_token_eos();
+def llama_token_eos() -> int:
+    """Return the token id given by the C function ``llama_token_eos``."""
+    eos_token = _lib.llama_token_eos()
+    return eos_token
+
+
+_lib.llama_token_eos.restype = llama_token
+_lib.llama_token_eos.argtypes = []
+
+
+# LLAMA_API llama_token llama_token_nl();
+def llama_token_nl() -> int:
+    """Return the token id given by the C function ``llama_token_nl``."""
+    nl_token = _lib.llama_token_nl()
+    return nl_token
+
+
+_lib.llama_token_nl.restype = llama_token
+_lib.llama_token_nl.argtypes = []
+
+
+# Sampling functions
+
+
+# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+def llama_sample_repetition_penalty(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    last_tokens_data,  # type: Array[llama_token]
+    last_tokens_size: c_size_t,
+    penalty: c_float,
+):
+    """Call the C function ``llama_sample_repetition_penalty``.
+
+    Args:
+        ctx: Context pointer handed to the C library.
+        candidates: Pointer to the ``llama_token_data_array`` to adjust.
+        last_tokens_data: ctypes array of previously seen tokens.
+        last_tokens_size: Number of entries in ``last_tokens_data`` (C ``size_t``).
+        penalty: Penalty value passed through as a C ``float``.
+
+    Returns:
+        None; the C function is declared ``void``.
+    """
+    return _lib.llama_sample_repetition_penalty(
+        ctx, candidates, last_tokens_data, last_tokens_size, penalty
+    )
+
+
+_lib.llama_sample_repetition_penalty.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_token_p,
+    # Declared as size_t in C, so it must not be marshalled as a C int.
+    c_size_t,
+    c_float,
+]
+_lib.llama_sample_repetition_penalty.restype = None
+
+
+# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+def llama_sample_frequency_and_presence_penalties(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    last_tokens_data,  # type: Array[llama_token]
+    last_tokens_size: c_size_t,
+    alpha_frequency: c_float,
+    alpha_presence: c_float,
+):
+    """Call the C function ``llama_sample_frequency_and_presence_penalties``.
+
+    Args:
+        ctx: Context pointer handed to the C library.
+        candidates: Pointer to the ``llama_token_data_array`` to adjust.
+        last_tokens_data: ctypes array of previously seen tokens.
+        last_tokens_size: Number of entries in ``last_tokens_data`` (C ``size_t``).
+        alpha_frequency: Frequency penalty coefficient (C ``float``).
+        alpha_presence: Presence penalty coefficient (C ``float``).
+
+    Returns:
+        None; the C function is declared ``void``.
+    """
+    return _lib.llama_sample_frequency_and_presence_penalties(
+        ctx,
+        candidates,
+        last_tokens_data,
+        last_tokens_size,
+        alpha_frequency,
+        alpha_presence,
+    )
+
+
+_lib.llama_sample_frequency_and_presence_penalties.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_token_p,
+    # Declared as size_t in C, so it must not be marshalled as a C int.
+    c_size_t,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_frequency_and_presence_penalties.restype = None
+
+
+# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+def llama_sample_softmax(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+):
+    """Call the C function ``llama_sample_softmax`` on ``candidates``.
+
+    Returns None; the C function is declared ``void``.
+    """
+    _lib.llama_sample_softmax(ctx, candidates)
+
+
+_lib.llama_sample_softmax.restype = None
+_lib.llama_sample_softmax.argtypes = [llama_context_p, llama_token_data_array_p]
+
+
+# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+def llama_sample_top_k(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    k: c_int,
+    min_keep: c_size_t,
+):
+    """Call the C function ``llama_sample_top_k`` with ``k`` and ``min_keep``.
+
+    Returns None; the C function is declared ``void``.
+    """
+    _lib.llama_sample_top_k(ctx, candidates, k, min_keep)
+
+
+_lib.llama_sample_top_k.restype = None
+_lib.llama_sample_top_k.argtypes = [llama_context_p, llama_token_data_array_p, c_int, c_size_t]
+
+
+# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+def llama_sample_top_p(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    p: c_float,
+    min_keep: c_size_t,
+):
+    """Call the C function ``llama_sample_top_p`` with ``p`` and ``min_keep``.
+
+    Returns None; the C function is declared ``void``.
+    """
+    _lib.llama_sample_top_p(ctx, candidates, p, min_keep)
+
+
+_lib.llama_sample_top_p.restype = None
+_lib.llama_sample_top_p.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t]
+
+
+# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+def llama_sample_tail_free(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    z: c_float,
+    min_keep: c_size_t,
+):
+    """Call the C function ``llama_sample_tail_free`` with ``z`` and ``min_keep``.
+
+    Returns None; the C function is declared ``void``.
+    """
+    _lib.llama_sample_tail_free(ctx, candidates, z, min_keep)
+
+
+_lib.llama_sample_tail_free.restype = None
+_lib.llama_sample_tail_free.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t]
+
+
+# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+def llama_sample_typical(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    p: c_float,
+    min_keep: c_size_t,
+):
+    """Call the C function ``llama_sample_typical`` with ``p`` and ``min_keep``.
+
+    Returns None; the C function is declared ``void``.
+    """
+    _lib.llama_sample_typical(ctx, candidates, p, min_keep)
+
+
+_lib.llama_sample_typical.restype = None
+_lib.llama_sample_typical.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t]
+
+
+# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+def llama_sample_temperature(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    temp: c_float,
+):
+    """Call the C function ``llama_sample_temperature`` with ``temp``.
+
+    Returns None; the C function is declared ``void``.
+    """
+    _lib.llama_sample_temperature(ctx, candidates, temp)
+
+
+_lib.llama_sample_temperature.restype = None
+_lib.llama_sample_temperature.argtypes = [llama_context_p, llama_token_data_array_p, c_float]
+
+
+# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+def llama_sample_token_mirostat(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    tau: c_float,
+    eta: c_float,
+    m: c_int,
+    mu,  # type: _Pointer[c_float]
+) -> int:
+    """Call the C function ``llama_sample_token_mirostat``.
+
+    ``mu`` is passed as a ``float *`` (see argtypes below).
+
+    Returns:
+        The ``llama_token`` returned by the C call.
+    """
+    sampled = _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
+    return sampled
+
+
+_lib.llama_sample_token_mirostat.restype = llama_token
+_lib.llama_sample_token_mirostat.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    c_float,  # tau
+    c_float,  # eta
+    c_int,  # m
+    c_float_p,  # mu
+]
+
+
+# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+def llama_sample_token_mirostat_v2(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    tau: c_float,
+    eta: c_float,
+    mu,  # type: _Pointer[c_float]
+) -> int:
+    """Call the C function ``llama_sample_token_mirostat_v2``.
+
+    ``mu`` is passed as a ``float *`` (see argtypes below).
+
+    Returns:
+        The ``llama_token`` returned by the C call.
+    """
+    sampled = _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
+    return sampled
+
+
+_lib.llama_sample_token_mirostat_v2.restype = llama_token
+_lib.llama_sample_token_mirostat_v2.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    c_float,  # tau
+    c_float,  # eta
+    c_float_p,  # mu
+]
+
+
+# @details Selects the token with the highest probability.
+# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+def llama_sample_token_greedy(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+) -> int:
+    """Return the ``llama_token`` chosen by the C function ``llama_sample_token_greedy``."""
+    chosen = _lib.llama_sample_token_greedy(ctx, candidates)
+    return chosen
+
+
+_lib.llama_sample_token_greedy.restype = llama_token
+_lib.llama_sample_token_greedy.argtypes = [llama_context_p, llama_token_data_array_p]
+
+
+# @details Randomly selects a token from the candidates based on their probabilities.
+# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+def llama_sample_token(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+) -> int:
+    """Return the ``llama_token`` chosen by the C function ``llama_sample_token``."""
+    chosen = _lib.llama_sample_token(ctx, candidates)
+    return chosen
+
+
+_lib.llama_sample_token.restype = llama_token
+_lib.llama_sample_token.argtypes = [llama_context_p, llama_token_data_array_p]
+
+
+# Performance information
+
+
+# LLAMA_API void llama_print_timings(struct llama_context * ctx);
+def llama_print_timings(ctx: llama_context_p):
+    """Call the C function ``llama_print_timings`` for ``ctx``; returns None."""
+    _lib.llama_print_timings(ctx)
+
+
+_lib.llama_print_timings.restype = None
+_lib.llama_print_timings.argtypes = [llama_context_p]
+
+
+# LLAMA_API void llama_reset_timings(struct llama_context * ctx);
+def llama_reset_timings(ctx: llama_context_p):
+    """Call the C function ``llama_reset_timings`` for ``ctx``; returns None."""
+    _lib.llama_reset_timings(ctx)
+
+
+_lib.llama_reset_timings.restype = None
+_lib.llama_reset_timings.argtypes = [llama_context_p]
+
+
+# Print system information
+# LLAMA_API const char * llama_print_system_info(void);
+def llama_print_system_info() -> bytes:
+    """Return the ``const char *`` result of ``llama_print_system_info`` as bytes."""
+    info = _lib.llama_print_system_info()
+    return info
+
+
+_lib.llama_print_system_info.restype = c_char_p
+_lib.llama_print_system_info.argtypes = []
+
+###################################################################################################
+
+
+# Initialise the llama backend as a side effect of importing this module and
+# record that it has been done. The flag is False until the call has returned.
+_llama_initialized = False
+llama_init_backend()
+_llama_initialized = True
--- a/examples/low_level_api_chat_cpp.py
+++ b/examples/low_level_api_chat_cpp.py
@ -0,0 +1,573 @@
+"""
+This is an example implementation of main.cpp from llama.cpp
+Quirks:
+ * It's not exactly alike since this port is designed around programmatic I/O
+ * Input is always echoed if on, so it should be turned off when using "input()"
+ * The first antiprompt should be the userprompt like "\nUser:",
+   because it's added when n_predict is reached (aka generation ended prematurely)
+ * n_predict can be set to -1 for unlimited length responses (or just a really high value)
+ * Instruction mode adds its own antiprompt.
+   You should also still be feeding the model with a "primer" prompt that
+   shows it the expected format.
+"""
+import ctypes
+import sys
+from time import time
+from os import cpu_count, path
+
+import llama_cpp
+from common import GptParams, gpt_params_parse, gpt_random_prompt
+
+# Raw ANSI escape sequences.
+ANSI_BOLD = "\x1b[1m"
+ANSI_COLOR_GREEN = "\x1b[32m"
+ANSI_COLOR_YELLOW = "\x1b[33m"
+ANSI_COLOR_RESET = "\x1b[0m"
+
+# Console styles by purpose, built from the sequences above.
+CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN
+CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
+CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
+
+# Iterative search
+# Actively searches and prevents a pattern from being returned
+class IterSearch:
+    """Streaming filter that keeps ``pattern`` out of a sequence of items.
+
+    Items are fed one at a time through ``__call__``. While the most recent
+    items match the start of the pattern they are held back; a complete match
+    is dropped, and items that can no longer be part of a match are released.
+    """
+
+    def __init__(self, pattern):
+        # Kept as a list so it can be compared slice-for-slice with the buffer.
+        self.pattern = list(pattern)
+        # Items currently held back because they match a prefix of the pattern.
+        self.buffer = []
+
+    def __call__(self, char):
+        """Feed one item and return the list of items that may be emitted now.
+
+        Returns an empty list while ``char`` extends (or completes) a match.
+        """
+        self.buffer.append(char)
+
+        # Release items from the front only until the remainder is again a
+        # prefix of the pattern. Flushing the whole buffer here would lose a
+        # match that starts inside it (pattern "ab", input "aab").
+        released = []
+        while self.buffer and self.pattern[:len(self.buffer)] != self.buffer:
+            released.append(self.buffer.pop(0))
+
+        # A complete match is swallowed entirely.
+        if self.buffer and len(self.buffer) >= len(self.pattern):
+            self.buffer.clear()
+
+        return released
+
+# A LLaMA interactive session
+class LLaMAInteract:
+    def __init__(self, params: GptParams) -> None:
+        """Load the model and prepare the state used by the generation loop.
+
+        Args:
+            params: Run configuration. It is stored on the instance and some of
+                its fields are updated in place (``seed``, ``prompt``,
+                ``n_keep``, ``interactive``, ``interactive_start`` and
+                ``logit_bias``).
+
+        Raises:
+            NotImplementedError: If ``params.perplexity`` or ``params.embedding``
+                is set.
+            RuntimeError: If the model cannot be loaded, the LoRA adapter cannot
+                be applied, the session file cannot be loaded, or the tokenized
+                prompt is longer than ``n_ctx - 4`` tokens.
+        """
+        # input args
+        self.params = params
+
+        if (self.params.perplexity):
+            raise NotImplementedError("""************
+please use the 'perplexity' tool for perplexity calculations
+************""")
+
+        if (self.params.embedding):
+            raise NotImplementedError("""************
+please use the 'embedding' tool for embedding calculations
+************""")
+
+        if (self.params.n_ctx > 2048):
+            print(f"""warning: model does not support \
+context sizes greater than 2048 tokens ({self.params.n_ctx} \
+specified) expect poor results""", file=sys.stderr)
+
+        # A non-positive seed is replaced by the current time.
+        if (self.params.seed <= 0):
+            self.params.seed = int(time())
+
+        print(f"seed = {self.params.seed}", file=sys.stderr)
+
+        if (self.params.random_prompt):
+            self.params.prompt = gpt_random_prompt(self.params.seed)
+
+        # runtime args
+        self.input_consumed = 0
+        self.n_past = 0
+        self.n_session_consumed = 0
+        self.first_antiprompt = []
+        self.remaining_tokens = self.params.n_predict
+        self.output_echo = self.params.input_echo
+
+        # model load
+        self.lparams = llama_cpp.llama_context_default_params()
+        self.lparams.n_ctx = self.params.n_ctx
+        self.lparams.n_parts = self.params.n_parts
+        self.lparams.seed = self.params.seed
+        self.lparams.memory_f16 = self.params.memory_f16
+        self.lparams.use_mlock = self.params.use_mlock
+        self.lparams.use_mmap = self.params.use_mmap
+
+        self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams)
+        if (not self.ctx):
+            raise RuntimeError(f"error: failed to load model '{self.params.model}'")
+
+        if (self.params.ignore_eos):
+            self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
+
+        if (len(self.params.lora_adapter) > 0):
+            if (llama_cpp.llama_apply_lora_from_file(
+                self.ctx,
+                self.params.lora_adapter.encode("utf8"),
+                self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None,
+                self.params.n_threads
+            ) != 0):
+                # Raise instead of returning: a plain return would hand back an
+                # instance that is missing most of its attributes.
+                raise RuntimeError("error: failed to apply lora adapter")
+
+        print(file=sys.stderr)
+        print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \
+| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr)
+
+        # determine the required inference memory per token:
+        if (self.params.mem_test):
+            tmp = [0, 1, 2, 3]
+            # The thread count lives on params; the instance has no n_threads attribute.
+            llama_cpp.llama_eval(
+                self.ctx,
+                (llama_cpp.llama_token * len(tmp))(*tmp),
+                len(tmp),
+                0,
+                self.params.n_threads,
+            )
+            llama_cpp.llama_print_timings(self.ctx)
+            self.exit()
+            return
+
+        # create internal context
+        self.n_ctx = llama_cpp.llama_n_ctx(self.ctx)
+
+        # Add a space in front of the first character to match OG llama tokenizer behavior
+        self.params.prompt = " " + self.params.prompt
+
+        # Load prompt file
+        # NOTE(review): a prompt read from file replaces the space-prefixed prompt
+        # above, so it does not get the leading space - confirm this is intended.
+        if (self.params.file):
+            with open(self.params.file) as f:
+                self.params.prompt = f.read()
+
+        self.session_tokens: list[llama_cpp.llama_token] = []
+        if (len(self.params.path_session) > 0):
+            print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr)
+
+            if (path.exists(self.params.path_session)):
+                _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
+                _n_token_count_out = llama_cpp.c_size_t()
+                if (llama_cpp.llama_load_session_file(
+                    self.ctx,
+                    self.params.path_session.encode("utf8"),
+                    _session_tokens,
+                    self.params.n_ctx,
+                    ctypes.byref(_n_token_count_out)
+                ) != 1):
+                    # Raise instead of returning a partially initialised instance.
+                    raise RuntimeError(f"error: failed to load session file '{self.params.path_session}'")
+                _n_token_count_out = _n_token_count_out.value
+                self.session_tokens = _session_tokens[:_n_token_count_out]
+                print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr)
+            else:
+                print("session file does not exist, will create", file=sys.stderr)
+
+        # tokenize the prompt
+        self.embd = []
+        self.embd_inp = self._tokenize(self.params.prompt)
+
+        if (len(self.embd_inp) > self.n_ctx - 4):
+            raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.n_ctx - 4})")
+
+        # debug message about similarity of saved session, if applicable
+        self.n_matching_session_tokens = 0
+        if len(self.session_tokens) > 0:
+            # Count how many leading session tokens equal the prompt tokens.
+            for token in self.session_tokens:
+                if self.n_matching_session_tokens >= len(self.embd_inp) or token != self.embd_inp[self.n_matching_session_tokens]:
+                    break
+                self.n_matching_session_tokens += 1
+
+            # Diagnostics go to stderr like every other status message here.
+            if self.n_matching_session_tokens >= len(self.embd_inp):
+                print("session file has exact match for prompt!", file=sys.stderr)
+            elif self.n_matching_session_tokens < (len(self.embd_inp) / 2):
+                print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated", file=sys.stderr)
+            else:
+                print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt", file=sys.stderr)
+
+        # Save again only when a session path is set and fewer than 3/4 of the
+        # prompt tokens matched the loaded session.
+        self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)
+
+        # number of tokens to keep when resetting context
+        if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct):
+            self.params.n_keep = len(self.embd_inp)
+
+        self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix)
+        self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False)
+
+        # in instruct mode, we inject a prefix and a suffix to each input by the user
+        self.antiecho = None
+        if (self.params.instruct):
+            self.params.interactive_start = True
+            _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
+            self.first_antiprompt.append(_ptn)
+            self.antiecho = IterSearch(_ptn)
+
+        # enable interactive mode if reverse prompt or interactive start is specified
+        if (len(self.params.antiprompt) != 0 or self.params.interactive_start):
+            self.params.interactive = True
+
+        # determine newline token
+        self.llama_token_newline = self._tokenize("\n", False)
+        self.llama_token_eot = self._tokenize(" [end of text]\n", False)
+
+        if (self.params.verbose_prompt):
+            print(f"""
+prompt: '{self.params.prompt}'
+number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr)
+
+            for token in self.embd_inp:
+                print(f"{token} -> '{llama_cpp.llama_token_to_str(self.ctx, token)}'", file=sys.stderr)
+
+            if (self.params.n_keep > 0):
+                print("static prompt based on n_keep: '", file=sys.stderr)
+                # n_keep was clamped to len(embd_inp) above, so the slice is in range.
+                for token in self.embd_inp[:self.params.n_keep]:
+                    print(llama_cpp.llama_token_to_str(self.ctx, token), file=sys.stderr)
+                print("'", file=sys.stderr)
+            print(file=sys.stderr)
+
+        if (self.params.interactive):
+            print("interactive mode on.", file=sys.stderr)
+
+            if (len(self.params.antiprompt) > 0):
+                for antiprompt in self.params.antiprompt:
+                    print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr)
+
+            if len(self.params.input_prefix) > 0:
+                print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr)
+
+        print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\
+repeat_penalty = {self.params.repeat_penalty},\
+presence_penalty = {self.params.presence_penalty},\
+frequency_penalty = {self.params.frequency_penalty},\
+top_k = {self.params.top_k},\
+tfs_z = {self.params.tfs_z},\
+top_p = {self.params.top_p},\
+typical_p = {self.params.typical_p},\
+temp = {self.params.temp},\
+mirostat = {self.params.mirostat},\
+mirostat_lr = {self.params.mirostat_eta},\
+mirostat_ent = {self.params.mirostat_tau},\
+
+generate: n_ctx = {self.n_ctx},\
+n_batch = {self.params.n_batch},\
+n_predict = {self.params.n_predict},\
+n_keep = {self.params.n_keep}
+
+""", file=sys.stderr)
+
+        # determine antiprompt tokens
+        for i in self.params.antiprompt:
+            self.first_antiprompt.append(self._tokenize(i, False))
+
+        self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices
+
+        if (self.params.interactive):
+            print("""== Running in interactive mode. ==
+ - Press Ctrl+C to interject at any time.
+ - Press Return to return control to LLaMa.
+ - If you want to submit another line, end your input in '\\'.
+
+""", file=sys.stderr)
+        self.set_color(CONSOLE_COLOR_PROMPT)
+
+    # tokenize a prompt
+    def _tokenize(self, prompt, bos=True):
+        """Tokenize ``prompt`` using the loaded context.
+
+        Args:
+            prompt: Text to tokenize. It is encoded as UTF-8 and characters
+                that cannot be encoded are dropped.
+            bos: Passed to ``llama_tokenize`` as its ``add_bos`` flag.
+
+        Returns:
+            A list with the tokens written by ``llama_tokenize``.
+
+        Raises:
+            RuntimeError: If ``llama_tokenize`` reports failure (negative count).
+        """
+        _text = prompt.encode("utf8", errors="ignore")
+        # Size the buffer from the UTF-8 byte length (+1 for the optional BOS
+        # token) rather than the character count: the C tokenizer receives
+        # bytes, and multi-byte characters can need more than one token each.
+        _arr = (llama_cpp.llama_token * (len(_text) + 1))()
+        _n = llama_cpp.llama_tokenize(self.ctx, _text, _arr, len(_arr), bos)
+        if _n < 0:
+            # A negative count would otherwise be used as a slice bound and
+            # silently return the wrong tokens.
+            raise RuntimeError(f"error: failed to tokenize prompt (llama_tokenize returned {_n})")
+        return _arr[:_n]
+
+    def set_color(self, c):
+        """Write the escape sequence ``c`` to stdout when ``params.use_color`` is set."""
+        if not self.params.use_color:
+            return
+        print(c, end="")
+
+    def use_antiprompt(self):
+        """Return True when at least one antiprompt has been registered."""
+        antiprompt_count = len(self.first_antiprompt)
+        return antiprompt_count > 0
+
+    # generate tokens
+    def generate(self):
+        while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1:
+            # predict
+            if len(self.embd) > 0:
+                # infinite text generation via context swapping
+                # if we run out of context:
+                # - take the n_keep first tokens from the original prompt (via n_past)
+                # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
+                if (self.n_past + len(self.embd) > self.n_ctx):
+                    n_left = self.n_past - self.params.n_keep
+                    self.n_past = self.params.n_keep
+
+                    # insert n_left/2 tokens at the start of embd from last_n_tokens
+                    _insert = self.last_n_tokens[
+                        self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd)
+                    ]
+                    self.embd = _insert + self.embd
+                    self.params.path_session = ""
+
+                # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+                # REVIEW
+                if self.n_session_consumed < len(self.session_tokens):
+                    for i in range(len(self.embd)):
+                        if self.embd[i] != self.session_tokens[self.n_session_consumed]:
+                            self.session_tokens = self.session_tokens[:self.n_session_consumed]
+                            break
+
+                        self.n_past += 1
+                        self.n_session_consumed += 1
+
+                        if self.n_session_consumed >= len(self.session_tokens):
+                            i += 1
+                            break
+
+                    if i > 0:
+                        self.embd = self.embd[i:]
+
+                # evaluate tokens in batches
+                # embd is typically prepared beforehand to fit within a batch, but not always
+                #TODO BUG: The batching code causes nonsensical generation
+                """for i in range(0, len(self.embd), self.params.n_batch):
+                    n_eval = self.params.n_batch
+                    _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval])
+                    if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
+                        print(f"failed to eval")
+                        return
+
+                    self.n_past += n_eval"""
+
+                if (llama_cpp.llama_eval(
+                    self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
+                ) != 0):
+                    raise Exception("Failed to llama_eval!")
+
+                if len(self.embd) > 0 and len(self.params.path_session) > 0:
+                    self.session_tokens.extend(self.embd)
+                    self.n_session_consumed = len(self.session_tokens)
+
+            self.n_past += len(self.embd)
+            self.embd = []
+            if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting
+                # out of user input, sample next token
+                top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k
+                repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n
+
+                # optionally save the session on first sample (for faster prompt loading next time)
+                if len(self.params.path_session) > 0 and self.need_to_save_session:
+                    self.need_to_save_session = False
+                    llama_cpp.llama_save_session_file(
+                        self.ctx,
+                        self.params.path_session.encode("utf8"),
+                        (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens),
+                        len(self.session_tokens)
+                    )
+
+                id = 0
+
+                logits = llama_cpp.llama_get_logits(self.ctx)
+                n_vocab = llama_cpp.llama_n_vocab(self.ctx)
+
+                # Apply params.logit_bias map
+                for key, value in self.params.logit_bias.items():
+                    logits[key] += value
+
+                _arr = (llama_cpp.llama_token_data * n_vocab)(*[
+                    llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
+                    for token_id in range(n_vocab)
+                ])
+                candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
+
+                # Apply penalties
+                nl_logit = logits[llama_cpp.llama_token_nl()]
+                last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
+
+                _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
+                llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
+                    _arr,
+                    last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
+                llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
+                    _arr,
+                    last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
+
+                if not self.params.penalize_nl:
+                    logits[llama_cpp.llama_token_nl()] = nl_logit
+
+                if self.params.temp <= 0:
+                    # Greedy sampling
+                    id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
+                else:
+                    if self.params.mirostat == 1:
+                        mirostat_mu = 2.0 * self.params.mirostat_tau
+                        mirostat_m = 100
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
+                        id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu))
+                    elif self.params.mirostat == 2:
+                        mirostat_mu = 2.0 * self.params.mirostat_tau
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
+                        id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
+                    else:
+                        # Temperature sampling
+                        llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
+                        llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z),min_keep=llama_cpp.c_size_t(1))
+                        llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p),min_keep=llama_cpp.c_size_t(1))
+                        llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p),min_keep=llama_cpp.c_size_t(1))
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
+                        id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
+                # print("`{}`".format(candidates_p.size))
+
+                self.last_n_tokens.pop(0)
+                self.last_n_tokens.append(id)
+
+                # replace end of text token with newline token when in interactive mode
+                if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
+                    id = self.llama_token_newline[0]
+                    self.embd.append(id)
+                    if (self.use_antiprompt()):
+                        # tokenize and inject first reverse prompt
+                        self.embd_inp += self.first_antiprompt[0]
+                        for id in self.first_antiprompt[0]:
+                            self.embd.append(id)
+                else:
+                    # add it to the context
+                    self.embd.append(id)
+
+                # echo this to console
+                self.output_echo = True
+
+                # decrement remaining sampling budget
+                self.remaining_tokens -= 1
+            else:
+                # output to console if input echo is on
+                self.output_echo = self.params.input_echo
+
+                # some user input remains from prompt or interaction, forward it to processing
+                while len(self.embd_inp) > self.input_consumed:
+                    self.embd.append(self.embd_inp[self.input_consumed])
+                    self.last_n_tokens.pop(0)
+                    self.last_n_tokens.append(self.embd_inp[self.input_consumed])
+                    self.input_consumed += 1
+                    if len(self.embd) >= self.params.n_batch:
+                        break
+
+            # display tokens
+            if self.output_echo:
+                for id in self.embd:
+                    if self.antiecho != None:
+                        for r in self.antiecho(id):
+                            yield r
+                    else:
+                        yield id
+
+            # reset color to default if we there is no pending user input
+            if (self.params.input_echo and len(self.embd_inp) == self.input_consumed):
+                self.set_color(CONSOLE_COLOR_DEFAULT)
+
+            if (self.params.interactive and len(self.embd_inp) <= self.input_consumed):
+                # if antiprompt is present, stop
+                if (self.use_antiprompt()):
+                    if True in [
+                        i == self.last_n_tokens[-len(i):]
+                        for i in self.first_antiprompt
+                    ]:
+                        break
+
+                # if we are using instruction mode, and we have processed the initial prompt
+                if (self.params.interactive_start):
+                    break
+
+            # end of text token
+            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos():
+                if (not self.params.instruct):
+                    for i in self.llama_token_eot:
+                        yield i
+                    break
+
+            # respect n_predict even if antiprompt is present
+            if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1):
+                # If we arent in instruction mode, fix the current generation by appending the antiprompt.
+                # Makes it so if chat ends prematurely you dont append the AI's text etc.
+                if not self.params.instruct:
+                    self.embd_inp += self.first_antiprompt[0]
+                self.n_remain = self.params.n_predict
+                break
+
+        self.params.interactive_start = False
+
+    def __enter__(self):
+        """Context-manager entry: ``with ... as m`` binds ``m`` to this instance."""
+        return self
+
+    def __exit__(self, type, value, tb):
+        """Context-manager exit: run ``self.exit()`` whether or not the body raised.
+
+        The exception triple is not inspected and nothing is returned, so an
+        exception raised inside the ``with`` body keeps propagating.
+        """
+        self.exit()
+
+    def exit(self):
+        """Release the llama context and restore the default console color.
+
+        Safe to call more than once: the context handle is cleared after it is
+        freed, so an explicit ``exit()`` followed by the implicit call made by
+        ``__exit__`` does not hand an already-freed context to ``llama_free``
+        a second time.
+        """
+        if self.ctx:
+            llama_cpp.llama_free(self.ctx)
+            # Drop the handle so a repeated call becomes a no-op.
+            self.ctx = None
+        self.set_color(CONSOLE_COLOR_DEFAULT)
+
+    # return past text
+    def past(self):
+        """Yield the text of the most recent ``self.n_past`` tokens.
+
+        Tokens are taken from the end of ``self.last_n_tokens``; each one is
+        converted with ``llama_token_to_str`` and decoded as UTF-8, dropping
+        bytes that cannot be decoded. Nothing is yielded when ``n_past`` is 0.
+        """
+        # A plain ``[-n_past:]`` slice returns the WHOLE list when n_past is 0,
+        # so compute the start index explicitly to get an empty result instead.
+        start = max(len(self.last_n_tokens) - self.n_past, 0)
+        for token in self.last_n_tokens[start:]:
+            yield llama_cpp.llama_token_to_str(self.ctx, token).decode("utf-8", errors="ignore")
+
+    # write input
+    def input(self, prompt: str):
+        """Queue ``prompt`` for the next generation pass.
+
+        The text is tokenized with ``self._tokenize`` and appended to
+        ``self.embd_inp``. In instruct mode the tokens are wrapped:
+        ``self.inp_prefix`` goes in front unless ``self.last_n_tokens``
+        already ends with it, and ``self.inp_suffix`` is appended afterwards.
+        """
+        instruct_mode = self.params.instruct
+        if instruct_mode:
+            prefix = self.inp_prefix
+            already_prefixed = self.last_n_tokens[-len(prefix):] == prefix
+            if not already_prefixed:
+                self.embd_inp += prefix
+        self.embd_inp += self._tokenize(prompt)
+        if instruct_mode:
+            self.embd_inp += self.inp_suffix
+
+    # write output
+    def output(self):
+        """Reset the sampling budget and stream generated tokens as text.
+
+        ``self.remaining_tokens`` is set to ``self.params.n_predict`` when
+        iteration starts; every token id produced by ``self.generate()`` is
+        then converted with ``llama_token_to_str`` and yielded as ``str``.
+        """
+        self.remaining_tokens = self.params.n_predict
+        for token in self.generate():
+            # A single token may carry only part of a multi-byte UTF-8
+            # character. Strict decoding would raise UnicodeDecodeError in the
+            # middle of a reply, so undecodable bytes are dropped instead.
+            yield llama_cpp.llama_token_to_str(self.ctx, token).decode("utf-8", errors="ignore")
+
+    # read user input
+    def read_input(self):
+        """Read one logical line of user input from the console.
+
+        A line that ends with a backslash continues on the next line: the
+        backslash is replaced by a newline and reading goes on. The returned
+        text always ends with a newline.
+        """
+        pieces = []
+        line = input()
+        while line.endswith("\\"):
+            # Strip the continuation marker and keep reading.
+            pieces.append(line[:-1] + "\n")
+            line = input()
+        pieces.append(line + "\n")
+        return "".join(pieces)
+
+    # interactive mode
+    def interact(self):
+        """Run the console chat loop.
+
+        Prints whatever ``self.output()`` produces for the initial prompt,
+        switches input echo off, and then, for as long as
+        ``self.params.interactive`` is set, alternates between reading a user
+        turn and printing the generated reply. A KeyboardInterrupt during a
+        reply ends that reply and returns to the input prompt.
+        """
+        for piece in self.output():
+            print(piece, end="", flush=True)
+        self.params.input_echo = False
+
+        while self.params.interactive:
+            # User turn: read text and queue it for generation.
+            self.set_color(CONSOLE_COLOR_USER_INPUT)
+            if not self.params.instruct:
+                print(self.params.input_prefix, end="")
+                self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}")
+                print(self.params.input_suffix, end="")
+            else:
+                print('\n> ', end="")
+                self.input(self.read_input())
+            self.set_color(CONSOLE_COLOR_DEFAULT)
+
+            # Model turn: stream the reply piece by piece.
+            try:
+                for piece in self.output():
+                    print(piece, end="", flush=True)
+            except KeyboardInterrupt:
+                self.set_color(CONSOLE_COLOR_DEFAULT)
+                if self.params.instruct:
+                    continue
+                # Outside instruct mode, print and queue the configured
+                # fix_prefix after an interrupted reply.
+                print(self.params.fix_prefix, end="")
+                self.input(self.params.fix_prefix)
+
+if __name__ == "__main__":
+    # Script entry point: build a sample chat transcript, parse the run
+    # parameters, and start an interactive session.
+    from datetime import datetime
+
+    USER_NAME="User"
+    AI_NAME="ChatLLaMa"
+
+    # The current time and year are substituted into the sample transcript.
+    time_now = datetime.now()
+    # NOTE(review): `prompt` is not referenced again in the lines below; it is
+    # not handed to gpt_params_parse() or LLaMAInteract. Confirm whether it
+    # was meant to be stored on the parsed params.
+    prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
+{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
+There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
+The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+
+{USER_NAME}: Hello, {AI_NAME}!
+{AI_NAME}: Hello {USER_NAME}! How may I help you today?
+{USER_NAME}: What time is it?
+{AI_NAME}: It is {time_now.strftime("%H:%M")}.
+{USER_NAME}: What year is it?
+{AI_NAME}: We are in {time_now.strftime("%Y")}.
+{USER_NAME}: What is a cat?
+{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
+{USER_NAME}: Name a color.
+{AI_NAME}: Blue
+{USER_NAME}:"""
+    params = gpt_params_parse()
+
+    # Leaving the with-block runs __exit__, which calls exit().
+    with LLaMAInteract(params) as m:
+        m.interact()
--- a/examples/low_level_api_llama_cpp.py
+++ b/examples/low_level_api_llama_cpp.py
@ -0,0 +1,102 @@
+import llama_cpp
+
+import multiprocessing
+
+import llama_cpp
+
+# Use one evaluation thread per CPU reported by the OS.
+N_THREADS = multiprocessing.cpu_count()
+
+prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
+
+# Create a context with the library's default parameters.
+lparams = llama_cpp.llama_context_default_params()
+ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams)
+# A failed load gives back a null context; stop here with a readable error
+# rather than passing it on to llama_eval.
+if not ctx:
+    raise RuntimeError("Failed to load model from ../models/7B/ggml-model.bin")
+
+# determine the required inference memory per token:
+tmp = [0, 1, 2, 3]
+llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS)
+
+# Number of tokens already evaluated in the context.
+n_past = 0
+
+prompt = b" " + prompt
+
+# Tokenize into a buffer with one slot per prompt byte plus one, then keep
+# only the slots the tokenizer reported as filled.
+embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
+n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
+embd_inp = embd_inp[:n_of_tok]
+
+n_ctx = llama_cpp.llama_n_ctx(ctx)
+
+# Never ask for more tokens than the context has room for after the prompt.
+n_predict = 20
+n_predict = min(n_predict, n_ctx - len(embd_inp))
+
+# Index of the next prompt token to feed, and whether echoing is suppressed.
+input_consumed = 0
+input_noecho = False
+
+remaining_tokens = n_predict
+
+# Tokens queued for the next evaluation, and a fixed-size window of the most
+# recent tokens used by the repetition/frequency/presence penalties.
+embd = []
+last_n_size = 64
+last_n_tokens_data = [0] * last_n_size
+n_batch = 24
+# Passed to the penalty samplers as the element count of last_n_tokens_data,
+# so it must not exceed the size of that window.
+last_n_repeat = last_n_size
+repeat_penalty = 1
+frequency_penalty = 0.0
+presence_penalty = 0.0
+
+# The vocabulary size does not change while the context is alive, so it is
+# queried once instead of on every sampling step.
+n_vocab = llama_cpp.llama_n_vocab(ctx)
+
+while remaining_tokens > 0:
+    # Evaluate whatever was queued by the previous iteration.
+    if len(embd) > 0:
+        eval_status = llama_cpp.llama_eval(
+            ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS
+        )
+        # llama_eval signals failure with a non-zero return value; sampling
+        # from the logits after a failed evaluation would be meaningless.
+        if eval_status != 0:
+            raise RuntimeError("llama_eval failed")
+
+    n_past += len(embd)
+    embd = []
+    if len(embd_inp) <= input_consumed:
+        # The prompt has been fully fed: sample the next token.
+        logits = llama_cpp.llama_get_logits(ctx)
+
+        # One candidate entry per vocabulary id, seeded with its raw logit.
+        candidates_arr = (llama_cpp.llama_token_data * n_vocab)(*[
+            llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
+            for token_id in range(n_vocab)
+        ])
+        candidates_p = llama_cpp.ctypes.pointer(
+            llama_cpp.llama_token_data_array(candidates_arr, len(candidates_arr), False)
+        )
+
+        # Penalize tokens that appear in the recent-token window.
+        recent_arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
+        llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
+            recent_arr,
+            last_n_repeat, repeat_penalty)
+        llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
+            recent_arr,
+            last_n_repeat, frequency_penalty, presence_penalty)
+
+        # top-k -> top-p -> temperature, then draw one token.
+        llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, min_keep=llama_cpp.c_size_t(1))
+        llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, min_keep=llama_cpp.c_size_t(1))
+        llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
+        next_token = llama_cpp.llama_sample_token(ctx, candidates_p)
+
+        # Slide the recent-token window and queue the token for evaluation.
+        last_n_tokens_data = last_n_tokens_data[1:] + [next_token]
+        embd.append(next_token)
+        input_noecho = False
+        remaining_tokens -= 1
+    else:
+        # Feed the prompt in chunks of at most n_batch tokens.
+        while len(embd_inp) > input_consumed:
+            embd.append(embd_inp[input_consumed])
+            last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
+            input_consumed += 1
+            if len(embd) >= n_batch:
+                break
+    if not input_noecho:
+        # Echo the queued tokens; a token may hold a partial UTF-8 sequence,
+        # hence errors="ignore".
+        for token in embd:
+            print(
+                llama_cpp.llama_token_to_str(ctx, token).decode("utf-8", errors="ignore"),
+                end="",
+                flush=True,
+            )
+
+    # Stop as soon as the model emits the end-of-stream token.
+    if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos():
+        break
+
+print()
+
+llama_cpp.llama_print_timings(ctx)
+
+llama_cpp.llama_free(ctx)
--- a/examples/quantize.py
+++ b/examples/quantize.py
@ -0,0 +1,25 @@
+import os
+import argparse
+import llama_cpp
+
+
+def main(args):
+    """Quantize a model file with ``llama_cpp.llama_model_quantize``.
+
+    Args:
+        args: Parsed command-line namespace providing ``fname_inp`` and
+            ``fname_out`` (paths as ``str``) and ``type`` (integer
+            quantization type).
+
+    Raises:
+        RuntimeError: If the input file is missing, the output file already
+            exists, or the quantization call returns a non-zero code.
+    """
+    # Validate the paths before anything is handed to the native library.
+    if not os.path.exists(args.fname_inp):
+        raise RuntimeError(f"Input file does not exist ({args.fname_inp})")
+    if os.path.exists(args.fname_out):
+        raise RuntimeError(f"Output file already exists ({args.fname_out})")
+    # The native call takes byte strings.
+    fname_inp = args.fname_inp.encode("utf-8")
+    fname_out = args.fname_out.encode("utf-8")
+    # The parser registers this positional as "type", so it is read as args.type.
+    itype = args.type
+    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
+    if return_code != 0:
+        raise RuntimeError("Failed to quantize model")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: three positional arguments, parsed and
+    # forwarded to main().
+    parser = argparse.ArgumentParser()
+    positionals = (
+        ("fname_inp", str, "Path to input model"),
+        ("fname_out", str, "Path to output model"),
+        ("type", int, "Type of quantization (2: q4_0, 3: q4_1)"),
+    )
+    for arg_name, arg_type, arg_help in positionals:
+        parser.add_argument(arg_name, type=arg_type, help=arg_help)
+    args = parser.parse_args()
+    main(args)