This commit is contained in:
Don Mahurin 2025-02-09 02:19:48 +05:30 committed by GitHub
commit 495c32cbe4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 1943 additions and 0 deletions

71
examples/Chat.py Normal file
View file

@ -0,0 +1,71 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract
def env_or_def(env, default):
    """Return environment variable ``env``, or ``default`` when it is not set.

    A variable that is set to the empty string is returned as ``""``; only a
    missing variable falls back to ``default``.
    """
    # Single lookup instead of a membership test followed by indexing.
    return os.environ.get(env, default)
# Settings that can be overridden from the environment; the second argument
# of each call is the fallback used when the variable is not set.
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
AI_NAME = env_or_def("AI_NAME", "ChatLLaMa")
USER_NAME = env_or_def("USER_NAME", "USER")
N_PREDICTS = int(env_or_def("N_PREDICTS", "2048"))
N_THREAD = int(env_or_def("N_THREAD", "8"))

# The local clock is sampled once at start-up; year and time are interpolated
# into the example dialog of the prompt.
today = datetime.datetime.today()
DATE_YEAR = f"{today:%Y}"
DATE_TIME = f"{today:%H:%M}"
# Initial prompt: a framing paragraph followed by an example dialog between
# USER_NAME and AI_NAME (with the start-up year and time filled in). It ends
# on an open "{USER_NAME}:" turn, to which the command-line arguments, joined
# by spaces, are appended.
prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.
{USER_NAME}: Hello, {AI_NAME}!
{AI_NAME}: Hello {USER_NAME}! How may I help you today?
{USER_NAME}: What year is it?
{AI_NAME}: We are in {DATE_YEAR}.
{USER_NAME}: Please tell me the largest city in Europe.
{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia.
{USER_NAME}: What can you tell me about Moscow?
{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russias symbolic center.
{USER_NAME}: What is a cat?
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: How do I pass command line arguments to a Node.js program?
{AI_NAME}: The arguments are stored in process.argv.
argv[0] is the path to the Node. js executable.
argv[1] is the path to the script file.
argv[2] is the first argument passed to the script.
argv[3] is the second argument passed to the script and so on.
{USER_NAME}: Name a color.
{AI_NAME}: Blue.
{USER_NAME}: What time is it?
{AI_NAME}: It is {DATE_TIME}.
{USER_NAME}:""" + " ".join(sys.argv[1:])
print("Loading model...")

# Model, context and sampling settings for the interactive chat session.
params = GptParams(
    model=MODEL,
    prompt=prompt,
    n_ctx=2048,
    n_batch=1024,
    n_threads=N_THREAD,
    n_predict=N_PREDICTS,
    temp=0.7,
    top_k=40,
    top_p=0.5,
    repeat_last_n=256,
    repeat_penalty=1.17647,
    use_color=True,
    interactive=True,
    # Reverse prompt: the start of a user turn.
    antiprompt=[f"{USER_NAME}:"],
    input_prefix=" ",
    # Appended after each user input.
    input_suffix=f"{AI_NAME}:",
)

with LLaMAInteract(params) as chat:
    chat.interact()

59
examples/Miku.py Normal file
View file

@ -0,0 +1,59 @@
#!/bin/python
import sys, os
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract
def env_or_def(env, default):
    """Return environment variable ``env``, or ``default`` when it is not set.

    A variable that is set to the empty string is returned as ``""``; only a
    missing variable falls back to ``default``.
    """
    # Single lookup instead of a membership test followed by indexing.
    return os.environ.get(env, default)
# Settings that can be overridden from the environment; the second argument
# of each call is the fallback used when the variable is not set.
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
AI_NAME = env_or_def("AI_NAME", "Miku")
USER_NAME = env_or_def("USER_NAME", "Anon")
N_PREDICTS = int(env_or_def("N_PREDICTS", "4096"))
# "0" is a sentinel: the thread count is only applied further down when it
# is greater than zero.
N_THREAD = int(env_or_def("N_THREAD", "0"))
# Initial prompt: a description of the AI_NAME persona followed by an example
# exchange with USER_NAME. It ends on an open "{USER_NAME}:" turn, to which
# the command-line arguments, joined by spaces, are appended.
prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer.
{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between {USER_NAME} and {AI_NAME}
The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice.
{AI_NAME} can only communicate through text, so she can't send images or videos.
{USER_NAME}: Hello!
{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that!
{AI_NAME}: What do you like to do in your free time? ^_^
{USER_NAME}:""" + " ".join(sys.argv[1:])
print("Loading model...")

# Model, context and sampling settings for the interactive session.
params = GptParams(
    model=MODEL,
    prompt=prompt,
    n_ctx=2048,
    n_batch=1024,
    n_keep=-1,
    n_predict=N_PREDICTS,
    temp=0.7,
    top_k=40,
    top_p=0.5,
    repeat_last_n=256,
    repeat_penalty=1.17647,
    use_color=True,
    interactive=True,
    # Reverse prompt: the start of a user turn.
    antiprompt=[f"{USER_NAME}:"],
)

# Only override the thread count when a positive value was configured;
# otherwise the GptParams default stays in effect.
if N_THREAD > 0:
    params.n_threads = N_THREAD

with LLaMAInteract(params) as chat:
    chat.interact()

49
examples/ReasonAct.py Normal file
View file

@ -0,0 +1,49 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract
def env_or_def(env, default):
    """Return environment variable ``env``, or ``default`` when it is not set.

    A variable that is set to the empty string is returned as ``""``; only a
    missing variable falls back to ``default``.
    """
    # Single lookup instead of a membership test followed by indexing.
    return os.environ.get(env, default)
# Model path, overridable through the MODEL environment variable.
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
# Initial prompt: instructions for a Thought / Action / Observation loop with
# a single `calculate[...]` action, followed by two worked examples. It ends
# on an open "Question:" line, to which the command-line arguments, joined by
# spaces, are appended.
prompt=f"""You run in a loop of Thought, Action, Observation.
At the end of the loop either Answer or restate your Thought and Action.
Use Thought to describe your thoughts about the question you have been asked.
Use Action to run one of these actions available to you:
- calculate[python math expression]
Observation will be the result of running those actions
Question: What is 4 * 7 / 3?
Thought: Do I need to use an action? Yes, I use calculate to do math
Action: calculate[4 * 7 / 3]
Observation: 9.3333333333
Thought: Do I need to use an action? No, have the result
Answer: The calculate tool says it is 9.3333333333
Question: What is capital of france?
Thought: Do I need to use an action? No, I know the answer
Answer: Paris is the capital of France
Question:""" + " ".join(sys.argv[1:])
print("Loading model...")

# Settings for the interactive Thought/Action/Observation session.
params = GptParams(
    model=MODEL,
    prompt=prompt,
    n_ctx=2048,
    n_threads=7,
    n_predict=-1,
    temp=0.2,
    top_k=10000,
    repeat_penalty=1,
    interactive=True,
    interactive_start=True,
    # Reverse prompts: the two line prefixes of the loop that are listed here.
    antiprompt=["Question:", "Observation:"],
    input_prefix=" ",
)

with LLaMAInteract(params) as session:
    session.interact()

202
examples/common.py Normal file
View file

@ -0,0 +1,202 @@
import os
import argparse
import re
from dataclasses import dataclass, field
from typing import List
# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
@dataclass
class GptParams:
    """Bundle of model, sampling and interaction settings for the examples.

    Instances are built directly by the example scripts or from the command
    line by ``gpt_params_parse``. Mutable fields use ``default_factory`` so
    every instance gets its own list / dict.
    """

    # --- run-time / context ---
    seed: int = -1
    n_threads: int = min(4, os.cpu_count() or 1)
    n_predict: int = 128
    n_parts: int = -1
    n_ctx: int = 512
    n_batch: int = 8
    n_keep: int = 0

    # --- sampling ---
    ignore_eos: bool = False
    logit_bias: dict[int, float] = field(default_factory=dict)  # token id -> bias
    top_k: int = 40
    top_p: float = 0.95
    tfs_z: float = 1.00
    typical_p: float = 1.00
    temp: float = 0.80
    repeat_penalty: float = 1.10
    repeat_last_n: int = 64
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    mirostat: int = 0
    mirostat_tau: float = 5.0
    mirostat_eta: float = 0.1

    # --- model, prompt and session files ---
    model: str = "./models/llama-7B/ggml-model.bin"
    prompt: str = ""
    path_session: str = ""
    input_prefix: str = " "
    input_suffix: str = ""
    antiprompt: List[str] = field(default_factory=list)

    lora_adapter: str = ""
    lora_base: str = ""

    # --- mode flags ---
    memory_f16: bool = True
    random_prompt: bool = False
    use_color: bool = False
    interactive: bool = False

    embedding: bool = False
    interactive_start: bool = False

    instruct: bool = False
    penalize_nl: bool = True
    perplexity: bool = False
    use_mmap: bool = True
    use_mlock: bool = False
    mem_test: bool = False
    verbose_prompt: bool = False

    # Path of a file containing the initial prompt; None when not given.
    file: str = None

    # If chat ended prematurely, append this to the conversation to fix it.
    # Set to "\nUser:" etc.
    # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:""
    fix_prefix: str = ""
    # Fixed: a stray trailing comma used to make this default the tuple (True,).
    input_echo: bool = True

    # Default instructions for Alpaca
    # switch to "Human" and "Assistant" for Vicuna.
    # TODO: TBD how they are gonna handle this upstream
    instruct_inp_prefix: str = "\n\n### Instruction:\n\n"
    instruct_inp_suffix: str = "\n\n### Response:\n\n"
def gpt_params_parse(argv = None):
    """Parse command-line options into a ``GptParams`` instance.

    Args:
        argv: list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        GptParams populated from the options. ``use_mmap`` is forced off when
        a LoRA adapter is given, and every well-formed ``--logit-bias`` entry
        is stored in ``logit_bias``.

    argparse itself prints a message and exits (``SystemExit``) on invalid
    options.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
    parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
    parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
    parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
    parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
    parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep")

    parser.add_argument(
        "-l",
        "--logit-bias",
        type=str,
        action='append',
        help="--logit-bias TOKEN_ID(+/-)BIAS",
        dest="logit_bias_str"
    )
    parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
    parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
    parser.add_argument("--top_p", type=float, default=0.95, help="top-p sampling",dest="top_p")
    parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z")
    # Added: GptParams.typical_p previously had no command-line switch.
    parser.add_argument("--typical", type=float, default=1.0, help="locally typical sampling, parameter p (1.0 = disabled)",dest="typical_p")
    parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
    parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
    parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n")
    # Fixed: this option used to write into dest="tfs_z", clobbering --tfs.
    parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="frequency_penalty")
    parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty")
    # Fixed: was type=float, default=1.0, which disagreed with GptParams
    # (int, default 0).
    parser.add_argument("--mirostat", type=int, default=0, help="use Mirostat sampling (0 = disabled).",dest="mirostat")
    parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau")
    parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")

    parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
    parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
    parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
    parser.add_argument("--session", type=str, default="", help="file to cache model state in (may be large!)",dest="path_session")
    parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
    parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix")
    parser.add_argument(
        "-r",
        "--reverse-prompt",
        type=str,
        action='append',
        help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).",
        dest="antiprompt"
    )

    parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter")
    parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base")

    parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
    parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt")
    parser.add_argument(
        "--color",
        action="store_true",
        help="colorise output to distinguish prompt and user input from generations",
        dest="use_color"
    )
    parser.add_argument(
        "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive"
    )

    parser.add_argument("--embedding", action="store_true", help="", dest="embedding")
    parser.add_argument(
        "--interactive-first",
        action="store_true",
        help="run in interactive mode and wait for input right away",
        dest="interactive_start"
    )

    parser.add_argument(
        "-ins",
        "--instruct",
        action="store_true",
        help="run in instruction mode (use with Alpaca or Vicuna models)",
        dest="instruct"
    )
    parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl")
    parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
    parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap")
    parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock")
    parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test")
    parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt")

    #Custom args
    parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix")
    parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo")

    parser.add_argument(
        "--interactive-start",
        action="store_true",
        help="run in interactive mode",
        dest="interactive"
    )

    args = parser.parse_args(argv)

    # logit_bias_str is not a GptParams field: pull it out before building.
    logit_bias_str = args.logit_bias_str
    delattr(args, "logit_bias_str")

    # action='append' leaves None when -r was never given; GptParams
    # declares a list, so normalise to an empty one.
    if args.antiprompt is None:
        args.antiprompt = []

    params = GptParams(**vars(args))

    # A LoRA adapter implies --no-mmap (see the --lora help text).
    if params.lora_adapter:
        params.use_mmap = False

    if logit_bias_str is not None:
        for spec in logit_bias_str:
            # TOKEN_ID followed by a signed bias. The whole string must match
            # so that a float bias such as "+1.5" is no longer truncated to
            # "+1". Entries that do not match are skipped, as before.
            m = re.fullmatch(r"(\d+)([-+](?:\d+(?:\.\d*)?|\.\d+|inf))", i := spec)
            if m:
                params.logit_bias[int(m.group(1))] = float(m.group(2))

    return params
def gpt_random_prompt(rng):
    """Return one of the built-in prompt openers, selected by ``rng``.

    Args:
        rng: an integer; it is reduced modulo the number of openers, so any
            value (including negative ones) maps to a valid entry.

    Returns:
        The selected opener string.
    """
    openers = (
        "So",
        "Once upon a time",
        "When",
        "The",
        "After",
        "If",
        "import",
        "He",
        "She",
        "They",
    )
    # Derive the modulus from the table instead of repeating its length.
    return openers[rng % len(openers)]
if __name__ == "__main__":
    # Run as a script: parse the process arguments and show the result.
    parsed_params = gpt_params_parse()
    print(parsed_params)

862
examples/llama_cpp.py Normal file
View file

@ -0,0 +1,862 @@
import sys
import os
import ctypes
from ctypes import (
c_int,
c_float,
c_char_p,
c_void_p,
c_bool,
POINTER,
_Pointer, # type: ignore
Structure,
Array,
c_uint8,
c_size_t,
)
import pathlib
# Load the library
def _load_shared_library(lib_base_name: str):
    """Locate the shared library and load it with ``ctypes.CDLL``.

    Args:
        lib_base_name: library name without ``lib`` prefix or extension.

    Returns:
        The loaded ``ctypes.CDLL`` handle.

    Raises:
        RuntimeError: the platform is unsupported, or a candidate file exists
            but could not be loaded (the original error is chained).
        FileNotFoundError: none of the candidate paths exists.

    When the ``LLAMA_CPP_LIB`` environment variable is set, its value is used
    as the only candidate path.
    """
    # Determine the file extension based on the platform
    if sys.platform.startswith("linux"):
        lib_ext = ".so"
    elif sys.platform == "darwin":
        lib_ext = ".dylib"
    elif sys.platform == "win32":
        lib_ext = ".dll"
    else:
        raise RuntimeError("Unsupported platform")

    # Construct the paths to the possible shared library names
    _base_path = pathlib.Path(__file__).parent.resolve()
    _base_path_parent = pathlib.Path(__file__).parent.parent.resolve()
    # Searching for the library in the current directory under the name "libllama" (default name
    # for llamacpp) and "llama" (default name for this repo)
    _lib_paths = [
        _base_path / f"lib{lib_base_name}{lib_ext}",
        _base_path_parent / f"lib{lib_base_name}{lib_ext}",
        _base_path / f"{lib_base_name}{lib_ext}",
    ]

    # An explicit override replaces the whole search list.
    if "LLAMA_CPP_LIB" in os.environ:
        lib_base_name = os.environ["LLAMA_CPP_LIB"]
        _lib = pathlib.Path(lib_base_name)
        _base_path = _lib.parent.resolve()
        _lib_paths = [_lib.resolve()]

    cdll_args = dict()  # type: ignore
    # Add the library directory to the DLL search path on Windows (if needed)
    if sys.platform == "win32" and sys.version_info >= (3, 8):
        os.add_dll_directory(str(_base_path))
        if "CUDA_PATH" in os.environ:
            for sub_dir in ("bin", "lib"):
                cuda_dir = os.path.join(os.environ["CUDA_PATH"], sub_dir)
                # Skip sub-directories that are absent instead of passing a
                # missing path to add_dll_directory.
                if os.path.isdir(cuda_dir):
                    os.add_dll_directory(cuda_dir)
        cdll_args["winmode"] = 0

    # Try to load the shared library, handling potential errors
    for _lib_path in _lib_paths:
        if _lib_path.exists():
            try:
                return ctypes.CDLL(str(_lib_path), **cdll_args)
            except Exception as e:
                # Chain the loader error so its traceback is not lost.
                raise RuntimeError(
                    f"Failed to load shared library '{_lib_path}': {e}"
                ) from e

    raise FileNotFoundError(
        f"Shared library with base name '{lib_base_name}' not found"
    )
# Specify the base name of the shared library to load
_lib_base_name = "llama"
# Load the library at import time; the bindings in this module are declared
# as attributes of this handle.
_lib = _load_shared_library(_lib_base_name)
# Misc: pointer types used in the argtypes/restype declarations
c_float_p = POINTER(c_float)
c_uint8_p = POINTER(c_uint8)
c_size_t_p = POINTER(c_size_t)
# llama.h bindings
# NOTE: the constants below are ctypes instances (c_uint / c_int), not plain
# Python ints; read `.value` to get the number.
# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74)
# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61)
# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66)
# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C)
# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E)
# #define LLAMA_FILE_VERSION 3
LLAMA_FILE_VERSION = c_int(3)
# Aliases: each name below is bound to the same object as the magic it names.
LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT
LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_VERSION = c_int(1)
# struct llama_context;
# The context is opaque on the Python side, so it is handled as a void pointer.
llama_context_p = c_void_p
# typedef int llama_token;
llama_token = c_int
llama_token_p = POINTER(llama_token)
# typedef struct llama_token_data {
# llama_token id; // token id
# float logit; // log-odds of the token
# float p; // probability of the token
# } llama_token_data;
class llama_token_data(Structure):
    """ctypes layout of C ``llama_token_data``: token id, logit, probability."""

    _fields_ = [("id", llama_token), ("logit", c_float), ("p", c_float)]


llama_token_data_p = POINTER(llama_token_data)
# typedef struct llama_token_data_array {
# llama_token_data * data;
# size_t size;
# bool sorted;
# } llama_token_data_array;
class llama_token_data_array(Structure):
    """ctypes layout of C ``llama_token_data_array``: data pointer, size, sorted flag."""

    _fields_ = [("data", llama_token_data_p), ("size", c_size_t), ("sorted", c_bool)]


llama_token_data_array_p = POINTER(llama_token_data_array)
# typedef void (*llama_progress_callback)(float progress, void *ctx);
# ctypes function prototype for that typedef: no return value, arguments
# (float, void *).
llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
# struct llama_context_params {
# int n_ctx; // text context
# int n_gpu_layers; // number of layers to store in VRAM
# int seed; // RNG seed, -1 for random
# bool f16_kv; // use fp16 for KV cache
# bool logits_all; // the llama_eval() call computes all logits, not just the last one
# bool vocab_only; // only load the vocabulary, no weights
# bool use_mmap; // use mmap if possible
# bool use_mlock; // force system to keep model in RAM
# bool embedding; // embedding mode only
# // called with a progress value between 0 and 1, pass NULL to disable
# llama_progress_callback progress_callback;
# // context pointer passed to the progress callback
# void * progress_callback_user_data;
# };
class llama_context_params(Structure):
    """ctypes layout of C ``struct llama_context_params``.

    Field order and types follow the struct listing in the comment above this
    class; they must stay in that order for the layout to match.
    """

    _fields_ = [
        ("n_ctx", c_int),
        ("n_gpu_layers", c_int),
        ("seed", c_int),
        ("f16_kv", c_bool),
        ("logits_all", c_bool),
        ("vocab_only", c_bool),
        ("use_mmap", c_bool),
        ("use_mlock", c_bool),
        ("embedding", c_bool),
        ("progress_callback", llama_progress_callback),
        ("progress_callback_user_data", c_void_p),
    ]


llama_context_params_p = POINTER(llama_context_params)
# enum llama_ftype {
# LLAMA_FTYPE_ALL_F32 = 0,
# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
# };
# Members of enum llama_ftype, each wrapped in a c_int instance. Values 5 and
# 6 have no constant here; the enum listing marks their support as removed.
LLAMA_FTYPE_ALL_F32 = c_int(0)
LLAMA_FTYPE_MOSTLY_F16 = c_int(1)
LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2)
LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4)
LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)
LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)
LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9)
# LLAMA_API struct llama_context_params llama_context_default_params();
def llama_context_default_params() -> llama_context_params:
    """Return the ``llama_context_params`` struct produced by the library."""
    defaults = _lib.llama_context_default_params()
    return defaults


_lib.llama_context_default_params.restype = llama_context_params
_lib.llama_context_default_params.argtypes = []
# LLAMA_API bool llama_mmap_supported();
def llama_mmap_supported() -> bool:
    """Return the boolean reported by the library's ``llama_mmap_supported()``."""
    supported = _lib.llama_mmap_supported()
    return supported


_lib.llama_mmap_supported.restype = c_bool
_lib.llama_mmap_supported.argtypes = []
# LLAMA_API bool llama_mlock_supported();
def llama_mlock_supported() -> bool:
    """Return the boolean reported by the library's ``llama_mlock_supported()``."""
    supported = _lib.llama_mlock_supported()
    return supported


_lib.llama_mlock_supported.restype = c_bool
_lib.llama_mlock_supported.argtypes = []
# // TODO: not great API - very likely to change
# // Initialize the llama + ggml backend
# // Call once at the start of the program
# LLAMA_API void llama_init_backend();
def llama_init_backend():
    """Initialize the llama + ggml backend; call once at program start."""
    # The C function is void, so there is nothing to hand back.
    _lib.llama_init_backend()


_lib.llama_init_backend.restype = None
_lib.llama_init_backend.argtypes = []
# LLAMA_API int64_t llama_time_us();
def llama_time_us() -> int:
    """Return the ``int64_t`` value of the library's ``llama_time_us()``."""
    timestamp = _lib.llama_time_us()
    return timestamp


_lib.llama_time_us.restype = ctypes.c_int64
_lib.llama_time_us.argtypes = []
# // Various functions for loading a ggml llama model.
# // Allocate (almost) all memory needed for the model.
# // Return NULL on failure
# LLAMA_API struct llama_context * llama_init_from_file(
# const char * path_model,
# struct llama_context_params params);
def llama_init_from_file(
    path_model: bytes, params: llama_context_params
) -> llama_context_p:
    """Load a model file and return its context pointer.

    The C function returns NULL on failure; with a ``c_void_p`` restype
    ctypes reports NULL as ``None``.
    """
    ctx = _lib.llama_init_from_file(path_model, params)
    return ctx


_lib.llama_init_from_file.restype = llama_context_p
_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params]
# Frees all allocated memory
# LLAMA_API void llama_free(struct llama_context * ctx);
def llama_free(ctx: llama_context_p):
    """Free all memory allocated for ``ctx``."""
    # void in C: nothing to return.
    _lib.llama_free(ctx)


_lib.llama_free.restype = None
_lib.llama_free.argtypes = [llama_context_p]
# TODO: not great API - very likely to change
# Returns 0 on success
# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
# LLAMA_API int llama_model_quantize(
# const char * fname_inp,
# const char * fname_out,
# enum llama_ftype ftype,
# int nthread);
def llama_model_quantize(
    fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int
) -> int:
    """Quantize ``fname_inp`` into ``fname_out``; returns 0 on success.

    ``nthread`` <= 0 lets the library pick the hardware concurrency.
    """
    status = _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)
    return status


_lib.llama_model_quantize.restype = c_int
_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
# Apply a LoRA adapter to a loaded model
# path_base_model is the path to a higher quality model to use as a base for
# the layers modified by the adapter. Can be NULL to use the current loaded model.
# The model needs to be reloaded before applying a new adapter, otherwise the adapter
# will be applied on top of the previous one
# Returns 0 on success
# LLAMA_API int llama_apply_lora_from_file(
# struct llama_context * ctx,
# const char * path_lora,
# const char * path_base_model,
# int n_threads);
def llama_apply_lora_from_file(
    ctx: llama_context_p,
    path_lora: c_char_p,
    path_base_model: c_char_p,
    n_threads: c_int,
) -> int:
    """Apply a LoRA adapter to the model loaded in ``ctx``; returns 0 on success.

    ``path_base_model`` may be NULL to use the currently loaded model as base.
    """
    status = _lib.llama_apply_lora_from_file(
        ctx, path_lora, path_base_model, n_threads
    )
    return status


_lib.llama_apply_lora_from_file.restype = c_int
_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int]
# Returns the number of tokens in the KV cache
# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
    """Return the number of tokens in the KV cache of ``ctx``."""
    token_count = _lib.llama_get_kv_cache_token_count(ctx)
    return token_count


_lib.llama_get_kv_cache_token_count.restype = c_int
_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
# Sets the current rng seed.
# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
def llama_set_rng_seed(ctx: llama_context_p, seed: c_int):
    """Set the current RNG seed of ``ctx``."""
    # void in C: nothing to return.
    _lib.llama_set_rng_seed(ctx, seed)


_lib.llama_set_rng_seed.restype = None
_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int]
# Returns the maximum size in bytes of the state (rng, logits, embedding
# and kv_cache) - will often be smaller after compacting tokens
# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
def llama_get_state_size(ctx: llama_context_p) -> int:
    """Return the maximum size in bytes of the serialized state of ``ctx``."""
    size_in_bytes = _lib.llama_get_state_size(ctx)
    return size_in_bytes


_lib.llama_get_state_size.restype = c_size_t
_lib.llama_get_state_size.argtypes = [llama_context_p]
# Copies the state to the specified destination address.
# Destination needs to have allocated enough memory.
# Returns the number of bytes copied
# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
def llama_copy_state_data(
    ctx: llama_context_p, dst  # type: Array[c_uint8]
) -> int:
    """Copy the state of ``ctx`` into ``dst`` and return the bytes copied.

    ``dst`` must already have enough memory allocated.
    """
    copied = _lib.llama_copy_state_data(ctx, dst)
    return copied


_lib.llama_copy_state_data.restype = c_size_t
_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
# Set the state reading from the specified address
# Returns the number of bytes read
# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
def llama_set_state_data(
    ctx: llama_context_p, src  # type: Array[c_uint8]
) -> int:
    """Restore the state of ``ctx`` from ``src`` and return the bytes read."""
    consumed = _lib.llama_set_state_data(ctx, src)
    return consumed


_lib.llama_set_state_data.restype = c_size_t
_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p]
# Save/load session file
# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
def llama_load_session_file(
    ctx: llama_context_p,
    path_session: bytes,
    tokens_out,  # type: Array[llama_token]
    n_token_capacity: c_size_t,
    n_token_count_out,  # type: _Pointer[c_size_t]
) -> bool:
    """Load a session file into ``ctx``.

    Args:
        ctx: context pointer.
        path_session: path of the session file.
        tokens_out: token buffer handed to the library.
        n_token_capacity: capacity of ``tokens_out``, in tokens.
        n_token_count_out: pointer to a ``size_t`` handed to the library
            (presumably receives the number of tokens stored - confirm
            against llama.h).

    Returns:
        The ``bool`` returned by the C function.
    """
    return _lib.llama_load_session_file(
        ctx, path_session, tokens_out, n_token_capacity, n_token_count_out
    )


_lib.llama_load_session_file.argtypes = [
    llama_context_p,
    c_char_p,
    llama_token_p,
    c_size_t,
    c_size_t_p,
]
# Fixed: the C prototype returns bool, but restype was c_size_t, so ctypes
# interpreted a one-byte result as a full size_t.
_lib.llama_load_session_file.restype = c_bool
# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
def llama_save_session_file(
    ctx: llama_context_p,
    path_session: bytes,
    tokens,  # type: Array[llama_token]
    n_token_count: c_size_t,
) -> bool:
    """Save the session of ``ctx`` to a file.

    Args:
        ctx: context pointer.
        path_session: path of the session file.
        tokens: token buffer handed to the library.
        n_token_count: number of tokens in ``tokens``.

    Returns:
        The ``bool`` returned by the C function.
    """
    return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)


_lib.llama_save_session_file.argtypes = [
    llama_context_p,
    c_char_p,
    llama_token_p,
    c_size_t,
]
# Fixed: the C prototype returns bool, but restype was c_size_t, so ctypes
# interpreted a one-byte result as a full size_t.
_lib.llama_save_session_file.restype = c_bool
# Run the llama inference to obtain the logits and probabilities for the next token.
# tokens + n_tokens is the provided batch of new tokens to process
# n_past is the number of tokens to use from previous eval calls
# Returns 0 on success
# LLAMA_API int llama_eval(
# struct llama_context * ctx,
# const llama_token * tokens,
# int n_tokens,
# int n_past,
# int n_threads);
def llama_eval(
    ctx: llama_context_p,
    tokens,  # type: Array[llama_token]
    n_tokens: c_int,
    n_past: c_int,
    n_threads: c_int,
) -> int:
    """Run inference over a batch of new tokens; returns 0 on success.

    ``tokens`` / ``n_tokens`` describe the new batch and ``n_past`` is the
    number of tokens to use from previous eval calls.
    """
    status = _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
    return status


_lib.llama_eval.restype = c_int
_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int]
# Convert the provided text into tokens.
# The tokens pointer must be large enough to hold the resulting tokens.
# Returns the number of tokens on success, no more than n_max_tokens
# Returns a negative number on failure - the number of tokens that would have been returned
# TODO: not sure if correct
# LLAMA_API int llama_tokenize(
# struct llama_context * ctx,
# const char * text,
# llama_token * tokens,
# int n_max_tokens,
# bool add_bos);
def llama_tokenize(
    ctx: llama_context_p,
    text: bytes,
    tokens,  # type: Array[llama_token]
    n_max_tokens: c_int,
    add_bos: c_bool,
) -> int:
    """Convert ``text`` into tokens written to ``tokens``.

    Returns the number of tokens on success (at most ``n_max_tokens``), or a
    negative number on failure.
    """
    n_written = _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
    return n_written


_lib.llama_tokenize.restype = c_int
_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool]
# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
def llama_n_vocab(ctx: llama_context_p) -> int:
    """Return the ``int`` result of the library's ``llama_n_vocab(ctx)``."""
    n_vocab = _lib.llama_n_vocab(ctx)
    return n_vocab


_lib.llama_n_vocab.restype = c_int
_lib.llama_n_vocab.argtypes = [llama_context_p]
# LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
def llama_n_ctx(ctx: llama_context_p) -> int:
    """Return the ``int`` result of the library's ``llama_n_ctx(ctx)``."""
    n_ctx = _lib.llama_n_ctx(ctx)
    return n_ctx


_lib.llama_n_ctx.restype = c_int
_lib.llama_n_ctx.argtypes = [llama_context_p]
# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
def llama_n_embd(ctx: llama_context_p) -> int:
    """Return the ``int`` result of the library's ``llama_n_embd(ctx)``."""
    n_embd = _lib.llama_n_embd(ctx)
    return n_embd


_lib.llama_n_embd.restype = c_int
_lib.llama_n_embd.argtypes = [llama_context_p]
# Token logits obtained from the last call to llama_eval()
# The logits for the last token are stored in the last row
# Can be mutated in order to change the probabilities of the next token
# Rows: n_tokens
# Cols: n_vocab
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
def llama_get_logits(
    ctx: llama_context_p,
):  # type: (...) -> Array[float] # type: ignore
    """Return a float pointer to the logits from the last ``llama_eval`` call.

    The data is laid out as n_tokens rows by n_vocab columns, and may be
    mutated to change the probabilities of the next token.
    """
    logits = _lib.llama_get_logits(ctx)
    return logits


_lib.llama_get_logits.restype = c_float_p
_lib.llama_get_logits.argtypes = [llama_context_p]
# Get the embeddings for the input
# shape: [n_embd] (1-dimensional)
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
def llama_get_embeddings(
    ctx: llama_context_p,
):  # type: (...) -> Array[float] # type: ignore
    """Return a float pointer to the embeddings for the input (shape ``[n_embd]``)."""
    embeddings = _lib.llama_get_embeddings(ctx)
    return embeddings


_lib.llama_get_embeddings.restype = c_float_p
_lib.llama_get_embeddings.argtypes = [llama_context_p]
# Token Id -> String. Uses the vocabulary in the provided context
# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
    """Map a token id to its string, using the vocabulary of ``ctx``."""
    text = _lib.llama_token_to_str(ctx, token)
    return text


_lib.llama_token_to_str.restype = c_char_p
_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
# Special tokens
# LLAMA_API llama_token llama_token_bos();
def llama_token_bos() -> int:
    """Return the special token id given by the library's ``llama_token_bos()``."""
    token_id = _lib.llama_token_bos()
    return token_id


_lib.llama_token_bos.restype = llama_token
_lib.llama_token_bos.argtypes = []
# LLAMA_API llama_token llama_token_eos();
def llama_token_eos() -> int:
    """Return the special token id given by the library's ``llama_token_eos()``."""
    token_id = _lib.llama_token_eos()
    return token_id


_lib.llama_token_eos.restype = llama_token
_lib.llama_token_eos.argtypes = []
# LLAMA_API llama_token llama_token_nl();
def llama_token_nl() -> int:
    """Return the special token id given by the library's ``llama_token_nl()``."""
    token_id = _lib.llama_token_nl()
    return token_id


_lib.llama_token_nl.restype = llama_token
_lib.llama_token_nl.argtypes = []
# Sampling functions


# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
def llama_sample_repetition_penalty(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    last_tokens_data,  # type: Array[llama_token]
    last_tokens_size: c_size_t,
    penalty: c_float,
):
    """Apply the repetition penalty to ``candidates`` in place.

    ``last_tokens_data`` is an array of recent token ids and
    ``last_tokens_size`` the number of entries to read from it.
    Returns None (the native function is ``void``).
    """
    return _lib.llama_sample_repetition_penalty(
        ctx, candidates, last_tokens_data, last_tokens_size, penalty
    )


_lib.llama_sample_repetition_penalty.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    llama_token_p,
    # The C prototype declares this parameter as size_t, not int.
    c_size_t,
    c_float,
]
_lib.llama_sample_repetition_penalty.restype = None
# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
def llama_sample_frequency_and_presence_penalties(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    last_tokens_data,  # type: Array[llama_token]
    last_tokens_size: c_size_t,
    alpha_frequency: c_float,
    alpha_presence: c_float,
):
    """Apply frequency and presence penalties to ``candidates`` in place.

    ``last_tokens_data`` is an array of recent token ids and
    ``last_tokens_size`` the number of entries to read from it.
    Returns None (the native function is ``void``).
    """
    return _lib.llama_sample_frequency_and_presence_penalties(
        ctx,
        candidates,
        last_tokens_data,
        last_tokens_size,
        alpha_frequency,
        alpha_presence,
    )


_lib.llama_sample_frequency_and_presence_penalties.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    llama_token_p,
    # The C prototype declares this parameter as size_t, not int.
    c_size_t,
    c_float,
    c_float,
]
_lib.llama_sample_frequency_and_presence_penalties.restype = None
# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_softmax(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
):
    """Sort ``candidates`` by logit and fill in their probabilities, in place.

    ``candidates`` is a pointer to a ``llama_token_data_array`` (matching the
    declared argtype and the C prototype). Returns None.
    """
    return _lib.llama_sample_softmax(ctx, candidates)


_lib.llama_sample_softmax.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
]
_lib.llama_sample_softmax.restype = None
# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
def llama_sample_top_k(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    k: c_int,
    min_keep: c_size_t,
):
    """Forward to the native Top-K sampler; ``candidates`` is passed by pointer. Returns None."""
    outcome = _lib.llama_sample_top_k(ctx, candidates, k, min_keep)
    return outcome


_lib.llama_sample_top_k.restype = None
_lib.llama_sample_top_k.argtypes = [llama_context_p, llama_token_data_array_p, c_int, c_size_t]
# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
def llama_sample_top_p(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    p: c_float,
    min_keep: c_size_t,
):
    """Forward to the native nucleus (top-p) sampler; ``candidates`` is passed by pointer. Returns None."""
    outcome = _lib.llama_sample_top_p(ctx, candidates, p, min_keep)
    return outcome


_lib.llama_sample_top_p.restype = None
_lib.llama_sample_top_p.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t]
# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
def llama_sample_tail_free(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    z: c_float,
    min_keep: c_size_t,
):
    """Forward to the native tail-free sampler; ``candidates`` is passed by pointer. Returns None."""
    outcome = _lib.llama_sample_tail_free(ctx, candidates, z, min_keep)
    return outcome


_lib.llama_sample_tail_free.restype = None
_lib.llama_sample_tail_free.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t]
# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
def llama_sample_typical(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    p: c_float,
    min_keep: c_size_t,
):
    """Forward to the native locally-typical sampler; ``candidates`` is passed by pointer. Returns None."""
    outcome = _lib.llama_sample_typical(ctx, candidates, p, min_keep)
    return outcome


_lib.llama_sample_typical.restype = None
_lib.llama_sample_typical.argtypes = [llama_context_p, llama_token_data_array_p, c_float, c_size_t]
# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
def llama_sample_temperature(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    temp: c_float,
):
    """Forward ``temp`` and the ``candidates`` pointer to the native temperature step. Returns None."""
    outcome = _lib.llama_sample_temperature(ctx, candidates, temp)
    return outcome


_lib.llama_sample_temperature.restype = None
_lib.llama_sample_temperature.argtypes = [llama_context_p, llama_token_data_array_p, c_float]
# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
# @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
def llama_sample_token_mirostat(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    tau: c_float,
    eta: c_float,
    m: c_int,
    mu,  # type: _Pointer[c_float]
) -> int:
    """Sample a token id with Mirostat 1.0.

    ``mu`` is passed by pointer; per the parameter notes above it is updated
    by the algorithm.
    """
    token = _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
    return token


_lib.llama_sample_token_mirostat.restype = llama_token
_lib.llama_sample_token_mirostat.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
    c_float,
    c_int,
    c_float_p,
]
# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
# @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
def llama_sample_token_mirostat_v2(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    tau: c_float,
    eta: c_float,
    mu,  # type: _Pointer[c_float]
) -> int:
    """Sample a token id with Mirostat 2.0.

    ``mu`` is passed by pointer; per the parameter notes above it is updated
    by the algorithm.
    """
    token = _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
    return token


_lib.llama_sample_token_mirostat_v2.restype = llama_token
_lib.llama_sample_token_mirostat_v2.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
    c_float,
    c_float_p,
]
# @details Selects the token with the highest probability.
# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_token_greedy(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
) -> int:
    """Return the id of the highest-probability token among ``candidates``."""
    token = _lib.llama_sample_token_greedy(ctx, candidates)
    return token


_lib.llama_sample_token_greedy.restype = llama_token
_lib.llama_sample_token_greedy.argtypes = [llama_context_p, llama_token_data_array_p]
# @details Randomly selects a token from the candidates based on their probabilities.
# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_token(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
) -> int:
    """Return the id of a token drawn at random from ``candidates`` according to their probabilities."""
    token = _lib.llama_sample_token(ctx, candidates)
    return token


_lib.llama_sample_token.restype = llama_token
_lib.llama_sample_token.argtypes = [llama_context_p, llama_token_data_array_p]
# Performance information


# LLAMA_API void llama_print_timings(struct llama_context * ctx);
def llama_print_timings(ctx: llama_context_p):
    """Invoke the native ``llama_print_timings`` for ``ctx``; nothing is returned."""
    _lib.llama_print_timings(ctx)


_lib.llama_print_timings.restype = None
_lib.llama_print_timings.argtypes = [llama_context_p]
# LLAMA_API void llama_reset_timings(struct llama_context * ctx);
def llama_reset_timings(ctx: llama_context_p):
    """Invoke the native ``llama_reset_timings`` for ``ctx``; nothing is returned."""
    _lib.llama_reset_timings(ctx)


_lib.llama_reset_timings.restype = None
_lib.llama_reset_timings.argtypes = [llama_context_p]
# Print system information
# LLAMA_API const char * llama_print_system_info(void);
def llama_print_system_info() -> bytes:
    """Return the native system-information string as bytes."""
    info = _lib.llama_print_system_info()
    return info


_lib.llama_print_system_info.restype = c_char_p
_lib.llama_print_system_info.argtypes = []
###################################################################################################


# Backend initialisation performed as a side effect of executing this module.
# NOTE(review): the flag is set to False immediately before it is tested, so
# the branch below is taken every time this module body runs.
_llama_initialized = False

if _llama_initialized is False:
    llama_init_backend()
    _llama_initialized = True

View file

@ -0,0 +1,573 @@
"""
This is an example implementation of main.cpp from llama.cpp
Quirks:
* It's not exactly alike since this port is designed around programmatic I/O
* Input is always echoed if on, so it should be turned off when using "input()"
* The first antiprompt should be the userprompt like "\nUser:",
because it's added when n_predict is reached (aka generation ended prematurely)
* n_predict can be set to -1 for unlimited length responses (or just a really high value)
* Instruction mode adds its own antiprompt.
You should also still be feeding the model with a "primer" prompt that
shows it the expected format.
"""
import ctypes
import sys
from time import time
from os import cpu_count, path
import llama_cpp
from common import GptParams, gpt_params_parse, gpt_random_prompt
# ANSI escape sequences used to style console output.
ANSI_COLOR_RESET = "\x1b[0m"
ANSI_COLOR_YELLOW = "\x1b[33m"
ANSI_BOLD = "\x1b[1m"
ANSI_COLOR_GREEN = "\x1b[32m"
# Named console styles built from the sequences above: the default resets all
# styling, the prompt is yellow, and user input is bold green.
CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN
# Iterative search
# Actively searches and prevents a pattern from being returned
class IterSearch:
    """Streaming filter that withholds one fixed sequence from a stream of items.

    Feed items one at a time by calling the instance. Items that might be the
    start of ``pattern`` are held back; once they can no longer be part of a
    match they are released, and a complete match is dropped entirely.
    """

    def __init__(self, pattern):
        """Store ``pattern`` (any iterable of comparable items) as a list."""
        self.pattern = list(pattern)
        # Items seen so far that still form a prefix of the pattern.
        self.buffer = []

    def __call__(self, char):
        """Consume one item and return the list of items released by it.

        Returns an empty list while a potential match is pending or when a
        full match has just been swallowed.
        """
        self.buffer.append(char)
        released = []
        # Release items from the front only until the remainder is again a
        # prefix of the pattern. Flushing the whole buffer on a mismatch would
        # miss a match that starts inside it (e.g. pattern "ab" in "aab").
        while self.buffer and self.pattern[:len(self.buffer)] != self.buffer:
            released.append(self.buffer.pop(0))
        if self.buffer and len(self.buffer) >= len(self.pattern):
            # Complete match: drop it without releasing anything.
            self.buffer.clear()
        return released
# A LLaMA interactive session
class LLaMAInteract:
    """Interactive text-generation session built on the low-level llama_cpp API.

    The constructor loads the model described by ``params``, optionally
    restores a saved session and tokenizes the initial prompt. ``generate``
    yields token ids, ``output`` yields decoded text, ``input`` queues user
    text and ``interact`` runs a console loop. Usable as a context manager;
    leaving the ``with`` block frees the native context.
    """

    def __init__(self, params: "GptParams") -> None:
        """Load the model and prepare all generation state.

        Raises:
            NotImplementedError: if ``params.perplexity`` or ``params.embedding`` is set.
            RuntimeError: if the model, LoRA adapter or session file cannot be
                loaded, or if the prompt does not fit in the context.
        """
        # input args
        self.params = params
        if self.params.perplexity:
            raise NotImplementedError("""************
please use the 'perplexity' tool for perplexity calculations
************""")
        if self.params.embedding:
            raise NotImplementedError("""************
please use the 'embedding' tool for embedding calculations
************""")

        if self.params.n_ctx > 2048:
            print(f"""warning: model does not support \
context sizes greater than 2048 tokens ({self.params.n_ctx} \
specified) expect poor results""", file=sys.stderr)

        if self.params.seed <= 0:
            self.params.seed = int(time())

        print(f"seed = {self.params.seed}", file=sys.stderr)

        if self.params.random_prompt:
            self.params.prompt = gpt_random_prompt(self.params.seed)

        # runtime args
        self.input_consumed = 0
        self.n_past = 0
        self.n_session_consumed = 0
        self.first_antiprompt = []
        self.remaining_tokens = self.params.n_predict
        self.output_echo = self.params.input_echo
        # Mirostat's running `mu` has to persist between sampled tokens, so it
        # is kept on the instance instead of being rebuilt for every token.
        self.mirostat_mu = llama_cpp.c_float(2.0 * self.params.mirostat_tau)

        # model load
        self.lparams = llama_cpp.llama_context_default_params()
        self.lparams.n_ctx = self.params.n_ctx
        self.lparams.n_parts = self.params.n_parts
        self.lparams.seed = self.params.seed
        self.lparams.memory_f16 = self.params.memory_f16
        self.lparams.use_mlock = self.params.use_mlock
        self.lparams.use_mmap = self.params.use_mmap

        self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams)
        if not self.ctx:
            raise RuntimeError(f"error: failed to load model '{self.params.model}'")

        if self.params.ignore_eos:
            self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")

        if len(self.params.lora_adapter) > 0:
            lora_base = self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None
            if llama_cpp.llama_apply_lora_from_file(
                self.ctx,
                self.params.lora_adapter.encode("utf8"),
                lora_base,
                self.params.n_threads
            ) != 0:
                # Raise (like a failed model load) rather than returning a
                # half-initialised object that fails later with AttributeError.
                raise RuntimeError("error: failed to apply lora adapter")

        print(file=sys.stderr)
        print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} "
              f"| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr)

        # determine the required inference memory per token:
        if self.params.mem_test:
            tmp = [0, 1, 2, 3]
            llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.params.n_threads)
            llama_cpp.llama_print_timings(self.ctx)
            self.exit()
            return

        # create internal context
        self.n_ctx = llama_cpp.llama_n_ctx(self.ctx)

        # Load prompt file (before the space is prepended, so that a prompt
        # read from a file receives the leading space as well)
        if self.params.file:
            with open(self.params.file) as f:
                self.params.prompt = f.read()

        # Add a space in front of the first character to match OG llama tokenizer behavior
        self.params.prompt = " " + self.params.prompt

        self.session_tokens: list[llama_cpp.llama_token] = []
        if len(self.params.path_session) > 0:
            print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr)

            if path.exists(self.params.path_session):
                _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
                _n_token_count_out = llama_cpp.c_size_t()
                if llama_cpp.llama_load_session_file(
                    self.ctx,
                    self.params.path_session.encode("utf8"),
                    _session_tokens,
                    self.params.n_ctx,
                    ctypes.byref(_n_token_count_out)
                ) != 1:
                    raise RuntimeError(f"error: failed to load session file '{self.params.path_session}'")
                _n_token_count_out = _n_token_count_out.value
                self.session_tokens = _session_tokens[:_n_token_count_out]
                print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr)
            else:
                print(f"session file does not exist, will create", file=sys.stderr)

        # tokenize the prompt
        self.embd = []
        self.embd_inp = self._tokenize(self.params.prompt)

        if len(self.embd_inp) > self.n_ctx - 4:
            raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.n_ctx - 4})")

        # debug message about similarity of saved session, if applicable
        self.n_matching_session_tokens = 0
        if len(self.session_tokens) > 0:
            for token in self.session_tokens:
                if (self.n_matching_session_tokens >= len(self.embd_inp)
                        or token != self.embd_inp[self.n_matching_session_tokens]):
                    break
                self.n_matching_session_tokens += 1

            if self.n_matching_session_tokens >= len(self.embd_inp):
                print(f"session file has exact match for prompt!", file=sys.stderr)
            elif self.n_matching_session_tokens < (len(self.embd_inp) / 2):
                print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated", file=sys.stderr)
            else:
                print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt", file=sys.stderr)

        self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)

        # number of tokens to keep when resetting context
        if self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct:
            self.params.n_keep = len(self.embd_inp)

        self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix)
        self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False)

        # in instruct mode, we inject a prefix and a suffix to each input by the user
        self.antiecho = None
        if self.params.instruct:
            self.params.interactive_start = True
            _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
            self.first_antiprompt.append(_ptn)
            self.antiecho = IterSearch(_ptn)

        # enable interactive mode if reverse prompt or interactive start is specified
        if len(self.params.antiprompt) != 0 or self.params.interactive_start:
            self.params.interactive = True

        # determine newline token
        self.llama_token_newline = self._tokenize("\n", False)
        self.llama_token_eot = self._tokenize(" [end of text]\n", False)

        if self.params.verbose_prompt:
            print(f"""
prompt: '{self.params.prompt}'
number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr)

            for token in self.embd_inp:
                print(f"{token} -> '{self._token_text(token)}'", file=sys.stderr)

            if self.params.n_keep > 0:
                print("static prompt based on n_keep: '", end="", file=sys.stderr)
                for token in self.embd_inp[:self.params.n_keep]:
                    print(self._token_text(token), end="", file=sys.stderr)
                print("'", file=sys.stderr)
            print(file=sys.stderr)

        if self.params.interactive:
            print("interactive mode on.", file=sys.stderr)

            for antiprompt in self.params.antiprompt:
                print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr)

            if len(self.params.input_prefix) > 0:
                print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr)

        print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\
repeat_penalty = {self.params.repeat_penalty},\
presence_penalty = {self.params.presence_penalty},\
frequency_penalty = {self.params.frequency_penalty},\
top_k = {self.params.top_k},\
tfs_z = {self.params.tfs_z},\
top_p = {self.params.top_p},\
typical_p = {self.params.typical_p},\
temp = {self.params.temp},\
mirostat = {self.params.mirostat},\
mirostat_lr = {self.params.mirostat_eta},\
mirostat_ent = {self.params.mirostat_tau},\
generate: n_ctx = {self.n_ctx},\
n_batch = {self.params.n_batch},\
n_predict = {self.params.n_predict},\
n_keep = {self.params.n_keep}
""", file=sys.stderr)

        # determine antiprompt tokens
        for antiprompt in self.params.antiprompt:
            self.first_antiprompt.append(self._tokenize(antiprompt, False))

        self.last_n_tokens = [0] * self.n_ctx  # TODO: deque doesnt support slices

        if self.params.interactive:
            print("""== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMa.
- If you want to submit another line, end your input in '\\'.
""", file=sys.stderr)
        self.set_color(CONSOLE_COLOR_PROMPT)

    # tokenize a prompt
    def _tokenize(self, prompt, bos=True):
        """Tokenize ``prompt`` (str) and return the token ids as a list.

        ``bos`` is forwarded to ``llama_tokenize``.

        Raises:
            RuntimeError: if the tokenizer reports a negative token count.
        """
        encoded = prompt.encode("utf8", errors="ignore")
        # Size the buffer by encoded byte length: non-ASCII text can yield
        # more tokens than it has characters.
        _arr = (llama_cpp.llama_token * (len(encoded) + 1))()
        _n = llama_cpp.llama_tokenize(self.ctx, encoded, _arr, len(_arr), bos)
        if _n < 0:
            # A negative count would otherwise silently slice from the end.
            raise RuntimeError(f"error: failed to tokenize prompt (llama_tokenize returned {_n})")
        return _arr[:_n]

    def _token_text(self, token):
        """Return the text of ``token`` decoded as UTF-8, dropping undecodable bytes."""
        return llama_cpp.llama_token_to_str(self.ctx, token).decode("utf-8", errors="ignore")

    def set_color(self, c):
        """Print the escape sequence ``c`` when ``params.use_color`` is enabled."""
        if self.params.use_color:
            print(c, end="")

    def use_antiprompt(self):
        """Return True when at least one antiprompt token sequence is registered."""
        return len(self.first_antiprompt) > 0

    def _sample_next_token(self):
        """Sample and return one token id from the logits of the last evaluation."""
        top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k
        repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n

        logits = llama_cpp.llama_get_logits(self.ctx)
        n_vocab = llama_cpp.llama_n_vocab(self.ctx)

        # Apply params.logit_bias map
        for key, value in self.params.logit_bias.items():
            logits[key] += value

        candidates = (llama_cpp.llama_token_data * n_vocab)(*[
            llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
            for token_id in range(n_vocab)
        ])
        candidates_p = ctypes.pointer(llama_cpp.llama_token_data_array(candidates, len(candidates), False))

        # Apply penalties
        nl_logit = logits[llama_cpp.llama_token_nl()]
        last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)

        last_tokens = (llama_cpp.llama_token * last_n_repeat)(
            *self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]
        )
        llama_cpp.llama_sample_repetition_penalty(
            self.ctx, candidates_p, last_tokens, last_n_repeat,
            llama_cpp.c_float(self.params.repeat_penalty))
        llama_cpp.llama_sample_frequency_and_presence_penalties(
            self.ctx, candidates_p, last_tokens, last_n_repeat,
            llama_cpp.c_float(self.params.frequency_penalty),
            llama_cpp.c_float(self.params.presence_penalty))

        if not self.params.penalize_nl:
            # NOTE(review): this restores the value in the logits buffer, not in
            # the candidates copy built above - confirm this is intended.
            logits[llama_cpp.llama_token_nl()] = nl_logit

        if self.params.temp <= 0:
            # Greedy sampling
            return llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)

        temp = llama_cpp.c_float(self.params.temp)
        if self.params.mirostat == 1:
            mirostat_m = 100
            llama_cpp.llama_sample_temperature(self.ctx, candidates_p, temp)
            return llama_cpp.llama_sample_token_mirostat(
                self.ctx, candidates_p,
                llama_cpp.c_float(self.params.mirostat_tau),
                llama_cpp.c_float(self.params.mirostat_eta),
                llama_cpp.c_int(mirostat_m),
                ctypes.byref(self.mirostat_mu))
        if self.params.mirostat == 2:
            llama_cpp.llama_sample_temperature(self.ctx, candidates_p, temp)
            return llama_cpp.llama_sample_token_mirostat_v2(
                self.ctx, candidates_p,
                llama_cpp.c_float(self.params.mirostat_tau),
                llama_cpp.c_float(self.params.mirostat_eta),
                ctypes.byref(self.mirostat_mu))

        # Temperature sampling
        llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
        llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
        llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
        llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, temp)
        return llama_cpp.llama_sample_token(self.ctx, candidates_p)

    # generate tokens
    def generate(self):
        """Evaluate pending tokens and yield token ids to display.

        Pending input is forwarded in batches of at most ``params.n_batch``;
        once it is consumed, one token is sampled per iteration. The loop
        stops on an antiprompt, on the end-of-text token, or when the
        ``n_predict`` budget is used up in interactive mode.

        Raises:
            RuntimeError: if ``llama_eval`` reports a failure.
        """
        while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1:
            # predict
            if len(self.embd) > 0:
                # infinite text generation via context swapping
                # if we run out of context:
                # - take the n_keep first tokens from the original prompt (via n_past)
                # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
                if self.n_past + len(self.embd) > self.n_ctx:
                    n_left = self.n_past - self.params.n_keep
                    self.n_past = self.params.n_keep

                    # insert n_left/2 tokens at the start of embd from last_n_tokens
                    _insert = self.last_n_tokens[
                        self.n_ctx - int(n_left / 2) - len(self.embd):-len(self.embd)
                    ]
                    self.embd = _insert + self.embd
                    self.params.path_session = ""

                # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
                if self.n_session_consumed < len(self.session_tokens):
                    # Count matched tokens explicitly: a loop index would be one
                    # short when the loop ends without break, leaving a token
                    # that was already counted in n_past to be evaluated again.
                    reused = 0
                    for token in self.embd:
                        if token != self.session_tokens[self.n_session_consumed]:
                            self.session_tokens = self.session_tokens[:self.n_session_consumed]
                            break
                        self.n_past += 1
                        self.n_session_consumed += 1
                        reused += 1
                        if self.n_session_consumed >= len(self.session_tokens):
                            break
                    if reused > 0:
                        self.embd = self.embd[reused:]

                # evaluate tokens
                # TODO BUG (from the original port): evaluating in n_batch-sized
                # chunks caused nonsensical generation, so embd is evaluated in one call.
                if len(self.embd) > 0:  # may be empty after session reuse
                    if llama_cpp.llama_eval(
                        self.ctx,
                        (llama_cpp.llama_token * len(self.embd))(*self.embd),
                        len(self.embd),
                        self.n_past,
                        self.params.n_threads
                    ) != 0:
                        raise RuntimeError("Failed to llama_eval!")

                    if len(self.params.path_session) > 0:
                        self.session_tokens.extend(self.embd)
                        self.n_session_consumed = len(self.session_tokens)

            self.n_past += len(self.embd)
            self.embd = []
            if len(self.embd_inp) <= self.input_consumed:
                # out of user input, sample next token

                # optionally save the session on first sample (for faster prompt loading next time)
                if len(self.params.path_session) > 0 and self.need_to_save_session:
                    self.need_to_save_session = False
                    llama_cpp.llama_save_session_file(
                        self.ctx,
                        self.params.path_session.encode("utf8"),
                        (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens),
                        len(self.session_tokens)
                    )

                token = self._sample_next_token()

                self.last_n_tokens.pop(0)
                self.last_n_tokens.append(token)

                # replace end of text token with newline token when in interactive mode
                if token == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct:
                    self.embd.append(self.llama_token_newline[0])
                    if self.use_antiprompt():
                        # tokenize and inject first reverse prompt
                        # NOTE(review): the antiprompt is added to both embd_inp
                        # and embd here - confirm it should be fed twice.
                        self.embd_inp += self.first_antiprompt[0]
                        self.embd.extend(self.first_antiprompt[0])
                else:
                    # add it to the context
                    self.embd.append(token)

                # echo this to console
                self.output_echo = True

                # decrement remaining sampling budget
                self.remaining_tokens -= 1
            else:
                # output to console if input echo is on
                self.output_echo = self.params.input_echo

                # some user input remains from prompt or interaction, forward it to processing
                while len(self.embd_inp) > self.input_consumed:
                    pending = self.embd_inp[self.input_consumed]
                    self.embd.append(pending)
                    self.last_n_tokens.pop(0)
                    self.last_n_tokens.append(pending)
                    self.input_consumed += 1
                    if len(self.embd) >= self.params.n_batch:
                        break

            # display tokens
            if self.output_echo:
                for token in self.embd:
                    if self.antiecho is not None:
                        for released in self.antiecho(token):
                            yield released
                    else:
                        yield token

            # reset color to default if we there is no pending user input
            if self.params.input_echo and len(self.embd_inp) == self.input_consumed:
                self.set_color(CONSOLE_COLOR_DEFAULT)

            if self.params.interactive and len(self.embd_inp) <= self.input_consumed:
                # if antiprompt is present, stop
                if self.use_antiprompt():
                    if any(
                        antiprompt == self.last_n_tokens[-len(antiprompt):]
                        for antiprompt in self.first_antiprompt
                    ):
                        break

                # if we are using instruction mode, and we have processed the initial prompt
                if self.params.interactive_start:
                    break

            # end of text token
            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos():
                if not self.params.instruct:
                    for token in self.llama_token_eot:
                        yield token
                break

            # respect n_predict even if antiprompt is present
            if self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1:
                # If we arent in instruction mode, fix the current generation by appending the antiprompt.
                # Makes it so if chat ends prematurely you dont append the AI's text etc.
                if not self.params.instruct and self.use_antiprompt():
                    self.embd_inp += self.first_antiprompt[0]
                # Restore the sampling budget (the attribute read by the loop condition).
                self.remaining_tokens = self.params.n_predict
                break

        self.params.interactive_start = False

    def __enter__(self):
        """Return the session itself for use in a ``with`` statement."""
        return self

    def __exit__(self, type, value, tb):
        """Release the native context when the ``with`` block ends."""
        self.exit()

    def exit(self):
        """Free the native context (at most once) and reset the console color."""
        ctx = getattr(self, "ctx", None)
        if ctx:
            llama_cpp.llama_free(ctx)
            # Forget the handle so a second call (e.g. __exit__ after the
            # mem_test path) does not free it again.
            self.ctx = None
        self.set_color(CONSOLE_COLOR_DEFAULT)

    # return past text
    def past(self):
        """Yield the decoded text of the last ``n_past`` tokens in ``last_n_tokens``."""
        if self.n_past <= 0:
            # [-0:] would select the whole buffer instead of nothing.
            return
        for token in self.last_n_tokens[-self.n_past:]:
            yield self._token_text(token)

    # write input
    def input(self, prompt: str):
        """Queue ``prompt`` for evaluation, wrapped in the instruct prefix/suffix in instruct mode."""
        if self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix:
            self.embd_inp += self.inp_prefix
        # NOTE(review): _tokenize defaults to bos=True here - confirm a BOS
        # token is wanted in front of every user input.
        self.embd_inp += self._tokenize(prompt)
        if self.params.instruct:
            self.embd_inp += self.inp_suffix

    # write output
    def output(self):
        """Reset the sampling budget to ``n_predict`` and yield generated text piece by piece."""
        self.remaining_tokens = self.params.n_predict
        for token in self.generate():
            # A single token may hold only part of a multi-byte character, so
            # decoding must not be strict (same policy as past()).
            yield self._token_text(token)

    # read user input
    def read_input(self):
        """Read console lines until one does not end in a backslash; return them joined by newlines."""
        lines = []
        while (line := input()).endswith("\\"):
            lines.append(line[:-1] + "\n")
        lines.append(line + "\n")
        return "".join(lines)

    # interactive mode
    def interact(self):
        """Print the initial output, then alternate between reading user input and printing output."""
        for piece in self.output():
            print(piece, end="", flush=True)
        self.params.input_echo = False

        while self.params.interactive:
            self.set_color(CONSOLE_COLOR_USER_INPUT)
            if self.params.instruct:
                print('\n> ', end="")
                self.input(self.read_input())
            else:
                print(self.params.input_prefix, end="")
                self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}")
                print(self.params.input_suffix, end="")
            self.set_color(CONSOLE_COLOR_DEFAULT)

            try:
                for piece in self.output():
                    print(piece, end="", flush=True)
            except KeyboardInterrupt:
                # Ctrl+C interrupts generation and hands control back to the user.
                self.set_color(CONSOLE_COLOR_DEFAULT)
                if not self.params.instruct:
                    print(self.params.fix_prefix, end="")
                    self.input(self.params.fix_prefix)
# Script entry point: parse the command-line parameters and run an interactive session.
if __name__ == "__main__":
    from datetime import datetime

    USER_NAME="User"
    AI_NAME="ChatLLaMa"

    time_now = datetime.now()
    # Example chat transcript used to prime the model when the user supplies no prompt.
    prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.
{USER_NAME}: Hello, {AI_NAME}!
{AI_NAME}: Hello {USER_NAME}! How may I help you today?
{USER_NAME}: What time is it?
{AI_NAME}: It is {time_now.strftime("%H:%M")}.
{USER_NAME}: What year is it?
{AI_NAME}: We are in {time_now.strftime("%Y")}.
{USER_NAME}: What is a cat?
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: Name a color.
{AI_NAME}: Blue
{USER_NAME}:"""

    params = gpt_params_parse()
    # The transcript above was previously built and then discarded. Use it as
    # the default only when no prompt source was given and instruct mode is off.
    if not (params.prompt or params.file or params.random_prompt or params.instruct):
        params.prompt = prompt

    with LLaMAInteract(params) as m:
        m.interact()

View file

@ -0,0 +1,102 @@
import llama_cpp
import multiprocessing
import llama_cpp
# Minimal generation loop on the low-level API: tokenize one prompt, feed it in
# batches, then sample up to n_predict tokens and print them as they arrive.
N_THREADS = multiprocessing.cpu_count()
MODEL_PATH = b"../models/7B/ggml-model.bin"

prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"

lparams = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_init_from_file(MODEL_PATH, lparams)
if not ctx:
    # A failed load yields a false-y handle; stop before it reaches native code.
    raise RuntimeError(f"error: failed to load model '{MODEL_PATH.decode('utf-8')}'")

try:
    # determine the required inference memory per token:
    tmp = [0, 1, 2, 3]
    if llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS) != 0:
        raise RuntimeError("Failed to llama_eval!")

    n_past = 0

    prompt = b" " + prompt

    embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
    n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
    embd_inp = embd_inp[:n_of_tok]

    n_ctx = llama_cpp.llama_n_ctx(ctx)

    n_predict = 20
    n_predict = min(n_predict, n_ctx - len(embd_inp))

    input_consumed = 0
    input_noecho = False

    remaining_tokens = n_predict

    embd = []
    last_n_size = 64
    last_n_tokens_data = [0] * last_n_size
    n_batch = 24
    last_n_repeat = 64
    repeat_penalty = 1
    frequency_penalty = 0.0
    presence_penalty = 0.0

    while remaining_tokens > 0:
        if len(embd) > 0:
            if llama_cpp.llama_eval(
                ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS
            ) != 0:
                raise RuntimeError("Failed to llama_eval!")

        n_past += len(embd)
        embd = []
        if len(embd_inp) <= input_consumed:
            # Prompt fully consumed: build the candidate list from the logits
            # and sample the next token.
            logits = llama_cpp.llama_get_logits(ctx)
            n_vocab = llama_cpp.llama_n_vocab(ctx)

            candidates = (llama_cpp.llama_token_data * n_vocab)(*[
                llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
                for token_id in range(n_vocab)
            ])
            candidates_p = llama_cpp.ctypes.pointer(
                llama_cpp.llama_token_data_array(candidates, len(candidates), False)
            )

            last_tokens = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
            llama_cpp.llama_sample_repetition_penalty(
                ctx, candidates_p, last_tokens, last_n_repeat, repeat_penalty
            )
            llama_cpp.llama_sample_frequency_and_presence_penalties(
                ctx, candidates_p, last_tokens, last_n_repeat, frequency_penalty, presence_penalty
            )

            llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, min_keep=llama_cpp.c_size_t(1))
            llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, min_keep=llama_cpp.c_size_t(1))
            llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
            token = llama_cpp.llama_sample_token(ctx, candidates_p)

            last_n_tokens_data = last_n_tokens_data[1:] + [token]
            embd.append(token)
            input_noecho = False
            remaining_tokens -= 1
        else:
            # Forward pending prompt tokens, at most n_batch per iteration.
            while len(embd_inp) > input_consumed:
                embd.append(embd_inp[input_consumed])
                last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
                input_consumed += 1
                if len(embd) >= n_batch:
                    break
        if not input_noecho:
            for token in embd:
                print(
                    llama_cpp.llama_token_to_str(ctx, token).decode("utf-8", errors="ignore"),
                    end="",
                    flush=True,
                )

        if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos():
            break

    print()

    llama_cpp.llama_print_timings(ctx)
finally:
    # Release the native context even when evaluation fails.
    llama_cpp.llama_free(ctx)

25
examples/quantize.py Normal file
View file

@ -0,0 +1,25 @@
import os
import argparse
import llama_cpp
def main(args):
    """Quantize the model at ``args.fname_inp`` into ``args.fname_out``.

    ``args`` needs ``fname_inp`` and ``fname_out`` (str paths) and the
    quantization type as ``itype`` or, as produced by the parser below, ``type``.

    Raises:
        RuntimeError: if the input file is missing, the output file already
            exists, or the native quantization call returns non-zero.
    """
    # The parser stores the value under "type"; "itype" is accepted as well so
    # callers written against the earlier attribute name keep working.
    itype = getattr(args, "itype", None)
    if itype is None:
        itype = args.type
    # Validate the paths before encoding them (the names were previously used
    # before they were assigned).
    if not os.path.exists(args.fname_inp):
        raise RuntimeError(f"Input file does not exist ({args.fname_inp})")
    if os.path.exists(args.fname_out):
        raise RuntimeError(f"Output file already exists ({args.fname_out})")
    fname_inp = args.fname_inp.encode("utf-8")
    fname_out = args.fname_out.encode("utf-8")
    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
    if return_code != 0:
        raise RuntimeError("Failed to quantize model")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("fname_inp", type=str, help="Path to input model")
    parser.add_argument("fname_out", type=str, help="Path to output model")
    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
    args = parser.parse_args()
    main(args)