Merge 93278f84cf into e6e6583199
Commit 495c32cbe4
8 changed files with 1943 additions and 0 deletions
examples/Chat.py (new file, 71 lines)
@@ -0,0 +1,71 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract

def env_or_def(env, default):
    if (env in os.environ):
        return os.environ[env]
    return default

AI_NAME = env_or_def("AI_NAME", "ChatLLaMa")
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
USER_NAME = env_or_def("USER_NAME", "USER")
N_PREDICTS = int(env_or_def("N_PREDICTS", "2048"))
N_THREAD = int(env_or_def("N_THREAD", "8"))

today = datetime.datetime.today()
DATE_YEAR = today.strftime("%Y")
DATE_TIME = today.strftime("%H:%M")

prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.

{USER_NAME}: Hello, {AI_NAME}!
{AI_NAME}: Hello {USER_NAME}! How may I help you today?
{USER_NAME}: What year is it?
{AI_NAME}: We are in {DATE_YEAR}.
{USER_NAME}: Please tell me the largest city in Europe.
{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia.
{USER_NAME}: What can you tell me about Moscow?
{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center.
{USER_NAME}: What is a cat?
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: How do I pass command line arguments to a Node.js program?
{AI_NAME}: The arguments are stored in process.argv.

argv[0] is the path to the Node. js executable.
argv[1] is the path to the script file.
argv[2] is the first argument passed to the script.
argv[3] is the second argument passed to the script and so on.
{USER_NAME}: Name a color.
{AI_NAME}: Blue.
{USER_NAME}: What time is it?
{AI_NAME}: It is {DATE_TIME}.
{USER_NAME}:""" + " ".join(sys.argv[1:])

print("Loading model...")
params = GptParams(
    n_ctx=2048,
    temp=0.7,
    top_k=40,
    top_p=0.5,
    repeat_last_n=256,
    n_batch=1024,
    repeat_penalty=1.17647,
    model=MODEL,
    n_threads=N_THREAD,
    n_predict=N_PREDICTS,
    use_color=True,
    interactive=True,
    antiprompt=[f"{USER_NAME}:"],
    input_prefix=" ",
    input_suffix=f"{AI_NAME}:",
    prompt=prompt,
)

with LLaMAInteract(params) as m:
    m.interact()
examples/Miku.py (new file, 59 lines)
@@ -0,0 +1,59 @@
#!/bin/python
import sys, os
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract

def env_or_def(env, default):
    if (env in os.environ):
        return os.environ[env]
    return default

AI_NAME = env_or_def("AI_NAME", "Miku")
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
USER_NAME = env_or_def("USER_NAME", "Anon")
N_PREDICTS = int(env_or_def("N_PREDICTS", "4096"))
N_THREAD = int(env_or_def("N_THREAD", "0"))

prompt = f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer.
{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between {USER_NAME} and {AI_NAME}
The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice.
{AI_NAME} can only communicate through text, so she can't send images or videos.


{USER_NAME}: Hello!
{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that!
{AI_NAME}: What do you like to do in your free time? ^_^
{USER_NAME}:""" + " ".join(sys.argv[1:])

print("Loading model...")
params = GptParams(
    n_batch=1024,
    n_ctx=2048,
    n_keep=-1,
    repeat_last_n=256,
    repeat_penalty=1.17647,
    temp=0.7,
    top_k=40,
    top_p=0.5,
    model=MODEL,
    n_predict=N_PREDICTS,
    use_color=True,
    interactive=True,
    antiprompt=[f"{USER_NAME}:"],
    prompt=prompt,
)

if N_THREAD > 0:
    params.n_threads = N_THREAD

with LLaMAInteract(params) as m:
    m.interact()
examples/ReasonAct.py (new file, 49 lines)
@@ -0,0 +1,49 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract

def env_or_def(env, default):
    if (env in os.environ):
        return os.environ[env]
    return default

MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")

prompt = f"""You run in a loop of Thought, Action, Observation.
At the end of the loop either Answer or restate your Thought and Action.
Use Thought to describe your thoughts about the question you have been asked.
Use Action to run one of these actions available to you:
- calculate[python math expression]
Observation will be the result of running those actions


Question: What is 4 * 7 / 3?
Thought: Do I need to use an action? Yes, I use calculate to do math
Action: calculate[4 * 7 / 3]
Observation: 9.3333333333
Thought: Do I need to use an action? No, have the result
Answer: The calculate tool says it is 9.3333333333
Question: What is capital of france?
Thought: Do I need to use an action? No, I know the answer
Answer: Paris is the capital of France
Question:""" + " ".join(sys.argv[1:])

print("Loading model...")
params = GptParams(
    interactive=True,
    interactive_start=True,
    top_k=10000,
    temp=0.2,
    repeat_penalty=1,
    n_threads=7,
    n_ctx=2048,
    antiprompt=["Question:", "Observation:"],
    model=MODEL,
    input_prefix=" ",
    n_predict=-1,
    prompt=prompt,
)

with LLaMAInteract(params) as m:
    m.interact()
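ReasonAct.py only primes the model with the Thought/Action/Observation format; nothing in this commit actually executes the calculate[...] action. A minimal sketch, not part of the diff, of how an emitted Action line could be turned into an Observation (eval is used purely for illustration; a real tool should use a safe expression parser):

import re

def run_action(line: str) -> str:
    # parse a line like "Action: calculate[4 * 7 / 3]" emitted by the model
    m = re.match(r"Action:\s*calculate\[(.+)\]", line.strip())
    if not m:
        return "Observation: unknown action"
    # eval() is for illustration only; restrict builtins so only plain math expressions run
    result = eval(m.group(1), {"__builtins__": {}}, {})
    return f"Observation: {result}"

print(run_action("Action: calculate[4 * 7 / 3]"))  # Observation: 9.333333333333334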
examples/common.py (new file, 202 lines)
@@ -0,0 +1,202 @@
import os
import argparse
import re

from dataclasses import dataclass, field
from typing import List

# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp


@dataclass
class GptParams:
    seed: int = -1
    n_threads: int = min(4, os.cpu_count() or 1)
    n_predict: int = 128
    n_parts: int = -1
    n_ctx: int = 512
    n_batch: int = 8
    n_keep: int = 0

    ignore_eos: bool = False
    logit_bias: dict[int, float] = field(default_factory=dict)
    top_k: int = 40
    top_p: float = 0.95
    tfs_z: float = 1.00
    typical_p: float = 1.00
    temp: float = 0.80
    repeat_penalty: float = 1.10
    repeat_last_n: int = 64
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    mirostat: int = 0
    mirostat_tau: float = 5.0
    mirostat_eta: float = 0.1

    model: str = "./models/llama-7B/ggml-model.bin"
    prompt: str = ""
    path_session: str = ""
    input_prefix: str = " "
    input_suffix: str = ""
    antiprompt: List[str] = field(default_factory=list)

    lora_adapter: str = ""
    lora_base: str = ""

    memory_f16: bool = True
    random_prompt: bool = False
    use_color: bool = False
    interactive: bool = False

    embedding: bool = False
    interactive_start: bool = False

    instruct: bool = False
    penalize_nl: bool = True
    perplexity: bool = False
    use_mmap: bool = True
    use_mlock: bool = False
    mem_test: bool = False
    verbose_prompt: bool = False

    file: str = None

    # If chat ended prematurely, append this to the conversation to fix it.
    # Set to "\nUser:" etc.
    # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"
    fix_prefix: str = ""
    input_echo: bool = True

    # Default instructions for Alpaca
    # switch to "Human" and "Assistant" for Vicuna.
    # TODO: TBD how they are gonna handle this upstream
    instruct_inp_prefix: str = "\n\n### Instruction:\n\n"
    instruct_inp_suffix: str = "\n\n### Response:\n\n"


def gpt_params_parse(argv = None):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)", dest="seed")
    parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation", dest="n_threads")
    parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)", dest="n_predict")
    parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
    parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context", dest="n_ctx")
    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing", dest="n_batch")
    parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt", dest="n_keep")

    parser.add_argument(
        "-l",
        "--logit-bias",
        type=str,
        action='append',
        help="--logit-bias TOKEN_ID(+/-)BIAS",
        dest="logit_bias_str"
    )
    parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
    parser.add_argument("--top_k", type=int, default=40, help="top-k sampling", dest="top_k")
    parser.add_argument("--top_p", type=float, default=0.95, help="top-p sampling", dest="top_p")
    parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)", dest="tfs_z")
    parser.add_argument("--temp", type=float, default=0.80, help="temperature", dest="temp")
    parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens", dest="repeat_penalty")
    parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalty", dest="repeat_last_n")
    parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)", dest="frequency_penalty")
    parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)", dest="presence_penalty")
    parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.", dest="mirostat")
    parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value", dest="mirostat_tau")
    parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta", dest="mirostat_eta")

    parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path", dest="model")
    parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt", dest="prompt")
    parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load", dest="file")
    parser.add_argument("--session", type=str, default="", help="file to cache model state in (may be large!)", dest="path_session")
    parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
    parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix")
    parser.add_argument(
        "-r",
        "--reverse-prompt",
        type=str,
        action='append',
        help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).",
        dest="antiprompt"
    )

    parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter")
    parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base")

    parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value", dest="memory_f16")
    parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt")
    parser.add_argument(
        "--color",
        action="store_true",
        help="colorise output to distinguish prompt and user input from generations",
        dest="use_color"
    )
    parser.add_argument(
        "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive"
    )

    parser.add_argument("--embedding", action="store_true", help="", dest="embedding")
    parser.add_argument(
        "--interactive-first",
        action="store_true",
        help="run in interactive mode and wait for input right away",
        dest="interactive_start"
    )

    parser.add_argument(
        "-ins",
        "--instruct",
        action="store_true",
        help="run in instruction mode (use with Alpaca or Vicuna models)",
        dest="instruct"
    )
    parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl")
    parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
    parser.add_argument("--no-mmap", action="store_false", help="do not memory-map model (slower load but may reduce pageouts if not using mlock)", dest="use_mmap")
    parser.add_argument("--mlock", action="store_true", help="force system to keep model in RAM rather than swapping or compressing", dest="use_mlock")
    parser.add_argument("--mtest", action="store_true", help="compute maximum memory usage", dest="mem_test")
    parser.add_argument("--verbose-prompt", action="store_true", help="print prompt before generation", dest="verbose_prompt")

    # Custom args
    parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix")
    parser.add_argument("--input-noecho", action="store_false", help="don't echo the input", dest="input_echo")

    parser.add_argument(
        "--interactive-start",
        action="store_true",
        help="run in interactive mode",
        dest="interactive"
    )

    args = parser.parse_args(argv)

    # the raw --logit-bias strings are parsed into the logit_bias dict below
    logit_bias_str = args.logit_bias_str
    delattr(args, "logit_bias_str")
    params = GptParams(**vars(args))

    if (params.lora_adapter):
        params.use_mmap = False

    if (logit_bias_str is not None):
        for i in logit_bias_str:
            if (m := re.match(r"(\d+)([-+]\d+)", i)):
                params.logit_bias[int(m.group(1))] = float(m.group(2))

    return params


def gpt_random_prompt(rng):
    return [
        "So",
        "Once upon a time",
        "When",
        "The",
        "After",
        "If",
        "import",
        "He",
        "She",
        "They",
    ][rng % 10]


if __name__ == "__main__":
    print(gpt_params_parse())
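The --logit-bias flag above takes one or more strings of the form TOKEN_ID(+/-)BIAS, which gpt_params_parse() folds into the logit_bias dict. A minimal sketch, not part of this commit, of how that format is interpreted (the token ids are hypothetical):

import re

logit_bias = {}
for arg in ["15043+1", "2-100"]:  # hypothetical "--logit-bias" values
    if (m := re.match(r"(\d+)([-+]\d+)", arg)):
        logit_bias[int(m.group(1))] = float(m.group(2))

print(logit_bias)  # {15043: 1.0, 2: -100.0}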
examples/llama_cpp.py (new file, 862 lines)
@@ -0,0 +1,862 @@
import sys
import os
import ctypes
from ctypes import (
    c_int,
    c_float,
    c_char_p,
    c_void_p,
    c_bool,
    POINTER,
    _Pointer,  # type: ignore
    Structure,
    Array,
    c_uint8,
    c_size_t,
)
import pathlib


# Load the library
def _load_shared_library(lib_base_name: str):
    # Determine the file extension based on the platform
    if sys.platform.startswith("linux"):
        lib_ext = ".so"
    elif sys.platform == "darwin":
        lib_ext = ".dylib"
    elif sys.platform == "win32":
        lib_ext = ".dll"
    else:
        raise RuntimeError("Unsupported platform")

    # Construct the paths to the possible shared library names
    _base_path = pathlib.Path(__file__).parent.resolve()
    _base_path_parent = pathlib.Path(__file__).parent.parent.resolve()
    # Searching for the library in the current directory under the name "libllama" (default name
    # for llamacpp) and "llama" (default name for this repo)
    _lib_paths = [
        _base_path / f"lib{lib_base_name}{lib_ext}",
        _base_path_parent / f"lib{lib_base_name}{lib_ext}",
        _base_path / f"{lib_base_name}{lib_ext}",
    ]

    if "LLAMA_CPP_LIB" in os.environ:
        lib_base_name = os.environ["LLAMA_CPP_LIB"]
        _lib = pathlib.Path(lib_base_name)
        _base_path = _lib.parent.resolve()
        _lib_paths = [_lib.resolve()]

    cdll_args = dict()  # type: ignore
    # Add the library directory to the DLL search path on Windows (if needed)
    if sys.platform == "win32" and sys.version_info >= (3, 8):
        os.add_dll_directory(str(_base_path))
        if "CUDA_PATH" in os.environ:
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
        cdll_args["winmode"] = 0

    # Try to load the shared library, handling potential errors
    for _lib_path in _lib_paths:
        if _lib_path.exists():
            try:
                return ctypes.CDLL(str(_lib_path), **cdll_args)
            except Exception as e:
                raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")

    raise FileNotFoundError(
        f"Shared library with base name '{lib_base_name}' not found"
    )


# Specify the base name of the shared library to load
_lib_base_name = "llama"

# Load the library
_lib = _load_shared_library(_lib_base_name)
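If the LLAMA_CPP_LIB environment variable is set, _load_shared_library() skips the search list and resolves exactly that path. A minimal sketch, not part of this commit (the library path is hypothetical):

import os

# must be set before the bindings module is imported
os.environ["LLAMA_CPP_LIB"] = "/usr/local/lib/libllama.so"

import llama_cpp  # _load_shared_library() now loads exactly that file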
# Misc
c_float_p = POINTER(c_float)
c_uint8_p = POINTER(c_uint8)
c_size_t_p = POINTER(c_size_t)

# llama.h bindings

# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74)
# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61)
# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66)
# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C)
# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E)

# #define LLAMA_FILE_VERSION 3
LLAMA_FILE_VERSION = c_int(3)
LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT
LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_VERSION = c_int(1)

# struct llama_context;
llama_context_p = c_void_p


# typedef int llama_token;
llama_token = c_int
llama_token_p = POINTER(llama_token)


# typedef struct llama_token_data {
#     llama_token id; // token id
#     float logit;    // log-odds of the token
#     float p;        // probability of the token
# } llama_token_data;
class llama_token_data(Structure):
    _fields_ = [
        ("id", llama_token),
        ("logit", c_float),
        ("p", c_float),
    ]


llama_token_data_p = POINTER(llama_token_data)


# typedef struct llama_token_data_array {
#     llama_token_data * data;
#     size_t size;
#     bool sorted;
# } llama_token_data_array;
class llama_token_data_array(Structure):
    _fields_ = [
        ("data", llama_token_data_p),
        ("size", c_size_t),
        ("sorted", c_bool),
    ]


llama_token_data_array_p = POINTER(llama_token_data_array)

# typedef void (*llama_progress_callback)(float progress, void *ctx);
llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)


# struct llama_context_params {
#     int n_ctx;        // text context
#     int n_gpu_layers; // number of layers to store in VRAM
#     int seed;         // RNG seed, -1 for random

#     bool f16_kv;     // use fp16 for KV cache
#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
#     bool vocab_only; // only load the vocabulary, no weights
#     bool use_mmap;   // use mmap if possible
#     bool use_mlock;  // force system to keep model in RAM
#     bool embedding;  // embedding mode only

#     // called with a progress value between 0 and 1, pass NULL to disable
#     llama_progress_callback progress_callback;
#     // context pointer passed to the progress callback
#     void * progress_callback_user_data;
# };
class llama_context_params(Structure):
    _fields_ = [
        ("n_ctx", c_int),
        ("n_gpu_layers", c_int),
        ("seed", c_int),
        ("f16_kv", c_bool),
        (
            "logits_all",
            c_bool,
        ),
        ("vocab_only", c_bool),
        ("use_mmap", c_bool),
        ("use_mlock", c_bool),
        ("embedding", c_bool),
        ("progress_callback", llama_progress_callback),
        ("progress_callback_user_data", c_void_p),
    ]


llama_context_params_p = POINTER(llama_context_params)

# enum llama_ftype {
#     LLAMA_FTYPE_ALL_F32 = 0,
#     LLAMA_FTYPE_MOSTLY_F16 = 1,           // except 1d tensors
#     LLAMA_FTYPE_MOSTLY_Q4_0 = 2,          // except 1d tensors
#     LLAMA_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
#     LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
#     // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,       // support has been removed
#     // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,       // support has been removed
#     LLAMA_FTYPE_MOSTLY_Q8_0 = 7,          // except 1d tensors
#     LLAMA_FTYPE_MOSTLY_Q5_0 = 8,          // except 1d tensors
#     LLAMA_FTYPE_MOSTLY_Q5_1 = 9,          // except 1d tensors
# };
LLAMA_FTYPE_ALL_F32 = c_int(0)
LLAMA_FTYPE_MOSTLY_F16 = c_int(1)
LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2)
LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4)
LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)
LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)
LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9)


# LLAMA_API struct llama_context_params llama_context_default_params();
def llama_context_default_params() -> llama_context_params:
    return _lib.llama_context_default_params()


_lib.llama_context_default_params.argtypes = []
_lib.llama_context_default_params.restype = llama_context_params


# LLAMA_API bool llama_mmap_supported();
def llama_mmap_supported() -> bool:
    return _lib.llama_mmap_supported()


_lib.llama_mmap_supported.argtypes = []
_lib.llama_mmap_supported.restype = c_bool


# LLAMA_API bool llama_mlock_supported();
def llama_mlock_supported() -> bool:
    return _lib.llama_mlock_supported()


_lib.llama_mlock_supported.argtypes = []
_lib.llama_mlock_supported.restype = c_bool


# // TODO: not great API - very likely to change
# // Initialize the llama + ggml backend
# // Call once at the start of the program
# LLAMA_API void llama_init_backend();
def llama_init_backend():
    return _lib.llama_init_backend()


_lib.llama_init_backend.argtypes = []
_lib.llama_init_backend.restype = None


# LLAMA_API int64_t llama_time_us();
def llama_time_us() -> int:
    return _lib.llama_time_us()


_lib.llama_time_us.argtypes = []
_lib.llama_time_us.restype = ctypes.c_int64


# // Various functions for loading a ggml llama model.
# // Allocate (almost) all memory needed for the model.
# // Return NULL on failure
# LLAMA_API struct llama_context * llama_init_from_file(
#     const char * path_model,
#     struct llama_context_params params);
def llama_init_from_file(
    path_model: bytes, params: llama_context_params
) -> llama_context_p:
    return _lib.llama_init_from_file(path_model, params)


_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params]
_lib.llama_init_from_file.restype = llama_context_p


# Frees all allocated memory
# LLAMA_API void llama_free(struct llama_context * ctx);
def llama_free(ctx: llama_context_p):
    return _lib.llama_free(ctx)


_lib.llama_free.argtypes = [llama_context_p]
_lib.llama_free.restype = None


# TODO: not great API - very likely to change
# Returns 0 on success
# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
# LLAMA_API int llama_model_quantize(
#     const char * fname_inp,
#     const char * fname_out,
#     enum llama_ftype ftype,
#     int nthread);
def llama_model_quantize(
    fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int
) -> int:
    return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)


_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
_lib.llama_model_quantize.restype = c_int


# Apply a LoRA adapter to a loaded model
# path_base_model is the path to a higher quality model to use as a base for
# the layers modified by the adapter. Can be NULL to use the current loaded model.
# The model needs to be reloaded before applying a new adapter, otherwise the adapter
# will be applied on top of the previous one
# Returns 0 on success
# LLAMA_API int llama_apply_lora_from_file(
#     struct llama_context * ctx,
#     const char * path_lora,
#     const char * path_base_model,
#     int n_threads);
def llama_apply_lora_from_file(
    ctx: llama_context_p,
    path_lora: c_char_p,
    path_base_model: c_char_p,
    n_threads: c_int,
) -> int:
    return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads)


_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int]
_lib.llama_apply_lora_from_file.restype = c_int


# Returns the number of tokens in the KV cache
# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
    return _lib.llama_get_kv_cache_token_count(ctx)


_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
_lib.llama_get_kv_cache_token_count.restype = c_int


# Sets the current rng seed.
# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
def llama_set_rng_seed(ctx: llama_context_p, seed: c_int):
    return _lib.llama_set_rng_seed(ctx, seed)


_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int]
_lib.llama_set_rng_seed.restype = None


# Returns the maximum size in bytes of the state (rng, logits, embedding
# and kv_cache) - will often be smaller after compacting tokens
# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
def llama_get_state_size(ctx: llama_context_p) -> int:
    return _lib.llama_get_state_size(ctx)


_lib.llama_get_state_size.argtypes = [llama_context_p]
_lib.llama_get_state_size.restype = c_size_t


# Copies the state to the specified destination address.
# Destination needs to have allocated enough memory.
# Returns the number of bytes copied
# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
def llama_copy_state_data(
    ctx: llama_context_p, dst  # type: Array[c_uint8]
) -> int:
    return _lib.llama_copy_state_data(ctx, dst)


_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
_lib.llama_copy_state_data.restype = c_size_t


# Set the state reading from the specified address
# Returns the number of bytes read
# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
def llama_set_state_data(
    ctx: llama_context_p, src  # type: Array[c_uint8]
) -> int:
    return _lib.llama_set_state_data(ctx, src)


_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p]
_lib.llama_set_state_data.restype = c_size_t


# Save/load session file
# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
def llama_load_session_file(
    ctx: llama_context_p,
    path_session: bytes,
    tokens_out,  # type: Array[llama_token]
    n_token_capacity: c_size_t,
    n_token_count_out,  # type: _Pointer[c_size_t]
) -> int:
    return _lib.llama_load_session_file(
        ctx, path_session, tokens_out, n_token_capacity, n_token_count_out
    )


_lib.llama_load_session_file.argtypes = [
    llama_context_p,
    c_char_p,
    llama_token_p,
    c_size_t,
    c_size_t_p,
]
_lib.llama_load_session_file.restype = c_size_t


# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
def llama_save_session_file(
    ctx: llama_context_p,
    path_session: bytes,
    tokens,  # type: Array[llama_token]
    n_token_count: c_size_t,
) -> int:
    return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)


_lib.llama_save_session_file.argtypes = [
    llama_context_p,
    c_char_p,
    llama_token_p,
    c_size_t,
]
_lib.llama_save_session_file.restype = c_size_t


# Run the llama inference to obtain the logits and probabilities for the next token.
# tokens + n_tokens is the provided batch of new tokens to process
# n_past is the number of tokens to use from previous eval calls
# Returns 0 on success
# LLAMA_API int llama_eval(
#     struct llama_context * ctx,
#     const llama_token * tokens,
#     int n_tokens,
#     int n_past,
#     int n_threads);
def llama_eval(
    ctx: llama_context_p,
    tokens,  # type: Array[llama_token]
    n_tokens: c_int,
    n_past: c_int,
    n_threads: c_int,
) -> int:
    return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)


_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int]
_lib.llama_eval.restype = c_int


# Convert the provided text into tokens.
# The tokens pointer must be large enough to hold the resulting tokens.
# Returns the number of tokens on success, no more than n_max_tokens
# Returns a negative number on failure - the number of tokens that would have been returned
# TODO: not sure if correct
# LLAMA_API int llama_tokenize(
#     struct llama_context * ctx,
#     const char * text,
#     llama_token * tokens,
#     int n_max_tokens,
#     bool add_bos);
def llama_tokenize(
    ctx: llama_context_p,
    text: bytes,
    tokens,  # type: Array[llama_token]
    n_max_tokens: c_int,
    add_bos: c_bool,
) -> int:
    return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)


_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool]
_lib.llama_tokenize.restype = c_int
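A minimal usage sketch for the bindings defined so far, not part of this commit (the model path is hypothetical and a compiled libllama must be resolvable by _load_shared_library): allocate a fixed-size ctypes token array, let llama_tokenize fill it, then read the result back.

import llama_cpp

MODEL_PATH = b"./models/llama-7B/ggml-model.bin"  # hypothetical path

lparams = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_init_from_file(MODEL_PATH, lparams)

prompt = b" Hello, world"
tokens = (llama_cpp.llama_token * lparams.n_ctx)()  # output buffer sized to the context
n_tokens = llama_cpp.llama_tokenize(ctx, prompt, tokens, lparams.n_ctx, True)

print(f"{n_tokens} tokens:", [tokens[i] for i in range(n_tokens)])

llama_cpp.llama_free(ctx)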
# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
def llama_n_vocab(ctx: llama_context_p) -> int:
    return _lib.llama_n_vocab(ctx)


_lib.llama_n_vocab.argtypes = [llama_context_p]
_lib.llama_n_vocab.restype = c_int


# LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
def llama_n_ctx(ctx: llama_context_p) -> int:
    return _lib.llama_n_ctx(ctx)


_lib.llama_n_ctx.argtypes = [llama_context_p]
_lib.llama_n_ctx.restype = c_int


# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
def llama_n_embd(ctx: llama_context_p) -> int:
    return _lib.llama_n_embd(ctx)


_lib.llama_n_embd.argtypes = [llama_context_p]
_lib.llama_n_embd.restype = c_int


# Token logits obtained from the last call to llama_eval()
# The logits for the last token are stored in the last row
# Can be mutated in order to change the probabilities of the next token
# Rows: n_tokens
# Cols: n_vocab
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
def llama_get_logits(
    ctx: llama_context_p,
):  # type: (...) -> Array[float] # type: ignore
    return _lib.llama_get_logits(ctx)


_lib.llama_get_logits.argtypes = [llama_context_p]
_lib.llama_get_logits.restype = c_float_p


# Get the embeddings for the input
# shape: [n_embd] (1-dimensional)
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
def llama_get_embeddings(
    ctx: llama_context_p,
):  # type: (...) -> Array[float] # type: ignore
    return _lib.llama_get_embeddings(ctx)


_lib.llama_get_embeddings.argtypes = [llama_context_p]
_lib.llama_get_embeddings.restype = c_float_p


# Token Id -> String. Uses the vocabulary in the provided context
# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
    return _lib.llama_token_to_str(ctx, token)


_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
_lib.llama_token_to_str.restype = c_char_p

# Special tokens


# LLAMA_API llama_token llama_token_bos();
def llama_token_bos() -> int:
    return _lib.llama_token_bos()


_lib.llama_token_bos.argtypes = []
_lib.llama_token_bos.restype = llama_token


# LLAMA_API llama_token llama_token_eos();
def llama_token_eos() -> int:
    return _lib.llama_token_eos()


_lib.llama_token_eos.argtypes = []
_lib.llama_token_eos.restype = llama_token


# LLAMA_API llama_token llama_token_nl();
def llama_token_nl() -> int:
    return _lib.llama_token_nl()


_lib.llama_token_nl.argtypes = []
_lib.llama_token_nl.restype = llama_token


# Sampling functions


# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
def llama_sample_repetition_penalty(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    last_tokens_data,  # type: Array[llama_token]
    last_tokens_size: c_int,
    penalty: c_float,
):
    return _lib.llama_sample_repetition_penalty(
        ctx, candidates, last_tokens_data, last_tokens_size, penalty
    )


_lib.llama_sample_repetition_penalty.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    llama_token_p,
    c_int,
    c_float,
]
_lib.llama_sample_repetition_penalty.restype = None


# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
def llama_sample_frequency_and_presence_penalties(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    last_tokens_data,  # type: Array[llama_token]
    last_tokens_size: c_int,
    alpha_frequency: c_float,
    alpha_presence: c_float,
):
    return _lib.llama_sample_frequency_and_presence_penalties(
        ctx,
        candidates,
        last_tokens_data,
        last_tokens_size,
        alpha_frequency,
        alpha_presence,
    )


_lib.llama_sample_frequency_and_presence_penalties.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    llama_token_p,
    c_int,
    c_float,
    c_float,
]
_lib.llama_sample_frequency_and_presence_penalties.restype = None


# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_softmax(
    ctx: llama_context_p, candidates  # type: _Pointer[llama_token_data]
):
    return _lib.llama_sample_softmax(ctx, candidates)


_lib.llama_sample_softmax.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
]
_lib.llama_sample_softmax.restype = None


# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
def llama_sample_top_k(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    k: c_int,
    min_keep: c_size_t,
):
    return _lib.llama_sample_top_k(ctx, candidates, k, min_keep)


_lib.llama_sample_top_k.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_int,
    c_size_t,
]
_lib.llama_sample_top_k.restype = None


# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
def llama_sample_top_p(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    p: c_float,
    min_keep: c_size_t,
):
    return _lib.llama_sample_top_p(ctx, candidates, p, min_keep)


_lib.llama_sample_top_p.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
    c_size_t,
]
_lib.llama_sample_top_p.restype = None


# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
def llama_sample_tail_free(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    z: c_float,
    min_keep: c_size_t,
):
    return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep)


_lib.llama_sample_tail_free.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
    c_size_t,
]
_lib.llama_sample_tail_free.restype = None


# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
def llama_sample_typical(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    p: c_float,
    min_keep: c_size_t,
):
    return _lib.llama_sample_typical(ctx, candidates, p, min_keep)


_lib.llama_sample_typical.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
    c_size_t,
]
_lib.llama_sample_typical.restype = None


# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
def llama_sample_temperature(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    temp: c_float,
):
    return _lib.llama_sample_temperature(ctx, candidates, temp)


_lib.llama_sample_temperature.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
]
_lib.llama_sample_temperature.restype = None


# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
def llama_sample_token_mirostat(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    tau: c_float,
    eta: c_float,
    m: c_int,
    mu,  # type: _Pointer[c_float]
) -> int:
    return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)


_lib.llama_sample_token_mirostat.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
    c_float,
    c_int,
    c_float_p,
]
_lib.llama_sample_token_mirostat.restype = llama_token


# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
def llama_sample_token_mirostat_v2(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    tau: c_float,
    eta: c_float,
    mu,  # type: _Pointer[c_float]
) -> int:
    return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)


_lib.llama_sample_token_mirostat_v2.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
    c_float,
    c_float,
    c_float_p,
]
_lib.llama_sample_token_mirostat_v2.restype = llama_token


# @details Selects the token with the highest probability.
# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_token_greedy(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
) -> int:
    return _lib.llama_sample_token_greedy(ctx, candidates)


_lib.llama_sample_token_greedy.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
]
_lib.llama_sample_token_greedy.restype = llama_token


# @details Randomly selects a token from the candidates based on their probabilities.
# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_token(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
) -> int:
    return _lib.llama_sample_token(ctx, candidates)


_lib.llama_sample_token.argtypes = [
    llama_context_p,
    llama_token_data_array_p,
]
_lib.llama_sample_token.restype = llama_token


# Performance information


# LLAMA_API void llama_print_timings(struct llama_context * ctx);
def llama_print_timings(ctx: llama_context_p):
    _lib.llama_print_timings(ctx)


_lib.llama_print_timings.argtypes = [llama_context_p]
_lib.llama_print_timings.restype = None


# LLAMA_API void llama_reset_timings(struct llama_context * ctx);
def llama_reset_timings(ctx: llama_context_p):
    _lib.llama_reset_timings(ctx)


_lib.llama_reset_timings.argtypes = [llama_context_p]
_lib.llama_reset_timings.restype = None


# Print system information
# LLAMA_API const char * llama_print_system_info(void);
def llama_print_system_info() -> bytes:
    return _lib.llama_print_system_info()


_lib.llama_print_system_info.argtypes = []
_lib.llama_print_system_info.restype = c_char_p

###################################################################################################


_llama_initialized = False

if not _llama_initialized:
    llama_init_backend()
    _llama_initialized = True
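Taken together, the bindings above are enough for a bare-bones generation loop. A minimal sketch, not part of this commit (the model path, thread count, and sampling settings are assumptions), that evaluates a prompt and samples a few tokens through the repetition-penalty / top-k / top-p / temperature pipeline; llama_init_backend() is already called when the module is imported:

import ctypes
import sys

import llama_cpp

MODEL_PATH = b"./models/llama-7B/ggml-model.bin"  # hypothetical path
N_THREADS = 4

lparams = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_init_from_file(MODEL_PATH, lparams)

tokens = (llama_cpp.llama_token * lparams.n_ctx)()
n_tokens = llama_cpp.llama_tokenize(ctx, b" The quick brown fox", tokens, lparams.n_ctx, True)

n_vocab = llama_cpp.llama_n_vocab(ctx)
embd = [tokens[i] for i in range(n_tokens)]  # tokens still waiting to be evaluated
history = list(embd)                         # everything seen so far, for the repetition penalty
n_past = 0

for _ in range(16):
    # evaluate the pending tokens
    batch = (llama_cpp.llama_token * len(embd))(*embd)
    llama_cpp.llama_eval(ctx, batch, len(embd), n_past, N_THREADS)
    n_past += len(embd)

    # build the candidates array from the logits of the last evaluated position
    logits = llama_cpp.llama_get_logits(ctx)
    _arr = (llama_cpp.llama_token_data * n_vocab)(
        *[llama_cpp.llama_token_data(i, logits[i], 0.0) for i in range(n_vocab)]
    )
    candidates_p = ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))

    # repetition penalty, then top-k / top-p / temperature, then sample
    last = (llama_cpp.llama_token * len(history))(*history)
    llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, last, len(history), 1.1)
    llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, 1)
    llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.95, 1)
    llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.8)
    tok = llama_cpp.llama_sample_token(ctx, candidates_p)

    if tok == llama_cpp.llama_token_eos():
        break
    sys.stdout.write(llama_cpp.llama_token_to_str(ctx, tok).decode("utf-8", errors="ignore"))
    history.append(tok)
    embd = [tok]

print()
llama_cpp.llama_print_timings(ctx)
llama_cpp.llama_free(ctx)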
examples/low_level_api_chat_cpp.py (new file, 573 lines)
@@ -0,0 +1,573 @@
"""
This is an example implementation of main.cpp from llama.cpp
Quirks:
 * It's not exactly alike since this port is designed around programmatic I/O
 * Input is always echoed if on, so it should be turned off when using "input()"
 * The first antiprompt should be the user prompt like "\nUser:",
   because it's added when n_predict is reached (aka generation ended prematurely)
 * n_predict can be set to -1 for unlimited length responses (or just a really high value)
 * Instruction mode adds its own antiprompt.
   You should also still be feeding the model with a "primer" prompt that
   shows it the expected format.
"""
import ctypes
import sys
from time import time
from os import cpu_count, path

import llama_cpp
from common import GptParams, gpt_params_parse, gpt_random_prompt

ANSI_COLOR_RESET = "\x1b[0m"
ANSI_COLOR_YELLOW = "\x1b[33m"
ANSI_BOLD = "\x1b[1m"
ANSI_COLOR_GREEN = "\x1b[32m"

CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN

# Iterative search
# Actively searches and prevents a pattern from being returned
class IterSearch:
    def __init__(self, pattern):
        self.pattern = list(pattern)
        self.buffer = []

    def __call__(self, char):
        self.buffer += [char]

        if (self.pattern[:len(self.buffer)] == self.buffer):
            if (len(self.buffer) >= len(self.pattern)):
                self.buffer.clear()
            return []

        _tmp = self.buffer[:]
        self.buffer.clear()
        return _tmp
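IterSearch buffers its input for as long as the buffer is still a prefix of the pattern, swallows the buffer once the full pattern has been matched, and flushes it otherwise. In the chat loop it is fed token ids; a minimal character-level sketch, not part of this commit, of the same behaviour:

s = IterSearch("### Instruction:")
out = []
for ch in "Hello### Instruction:World":
    out += s(ch)
print("".join(out))  # "HelloWorld" - the matched pattern never reaches the output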
|
||||
|
||||
# A LLaMA interactive session
|
||||
class LLaMAInteract:
|
||||
def __init__(self, params: GptParams) -> None:
|
||||
# input args
|
||||
self.params = params
|
||||
|
||||
if (self.params.perplexity):
|
||||
raise NotImplementedError("""************
|
||||
please use the 'perplexity' tool for perplexity calculations
|
||||
************""")
|
||||
|
||||
if (self.params.embedding):
|
||||
raise NotImplementedError("""************
|
||||
please use the 'embedding' tool for embedding calculations
|
||||
************""")
|
||||
|
||||
if (self.params.n_ctx > 2048):
|
||||
print(f"""warning: model does not support \
|
||||
context sizes greater than 2048 tokens ({self.params.n_ctx} \
|
||||
specified) expect poor results""", file=sys.stderr)
|
||||
|
||||
if (self.params.seed <= 0):
|
||||
self.params.seed = int(time())
|
||||
|
||||
print(f"seed = {self.params.seed}", file=sys.stderr)
|
||||
|
||||
if (self.params.random_prompt):
|
||||
self.params.prompt = gpt_random_prompt(self.params.seed)
|
||||
|
||||
# runtime args
|
||||
self.input_consumed = 0
|
||||
self.n_past = 0
|
||||
self.n_session_consumed = 0
|
||||
self.first_antiprompt = []
|
||||
self.remaining_tokens = self.params.n_predict
|
||||
self.output_echo = self.params.input_echo
|
||||
|
||||
# model load
|
||||
self.lparams = llama_cpp.llama_context_default_params()
|
||||
self.lparams.n_ctx = self.params.n_ctx
|
||||
self.lparams.n_parts = self.params.n_parts
|
||||
self.lparams.seed = self.params.seed
|
||||
self.lparams.memory_f16 = self.params.memory_f16
|
||||
self.lparams.use_mlock = self.params.use_mlock
|
||||
self.lparams.use_mmap = self.params.use_mmap
|
||||
|
||||
self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams)
|
||||
if (not self.ctx):
|
||||
raise RuntimeError(f"error: failed to load model '{self.params.model}'")
|
||||
|
||||
if (self.params.ignore_eos):
|
||||
self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
|
||||
|
||||
if (len(self.params.lora_adapter) > 0):
|
||||
if (llama_cpp.llama_apply_lora_from_file(
|
||||
self.ctx,
|
||||
self.params.lora_adapter.encode("utf8"),
|
||||
self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None,
|
||||
self.params.n_threads
|
||||
) != 0):
|
||||
print("error: failed to apply lora adapter")
|
||||
return
|
||||
|
||||
print(file=sys.stderr)
|
||||
print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \
|
||||
| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr)
|
||||
|
||||
# determine the required inference memory per token:
|
||||
if (self.params.mem_test):
|
||||
tmp = [0, 1, 2, 3]
|
||||
llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads)
|
||||
llama_cpp.llama_print_timings(self.ctx)
|
||||
self.exit()
|
||||
return
|
||||
|
||||

        # create internal context
        self.n_ctx = llama_cpp.llama_n_ctx(self.ctx)

        # Add a space in front of the first character to match OG llama tokenizer behavior
        self.params.prompt = " " + self.params.prompt

        # Load prompt file
        if (self.params.file):
            with open(self.params.file) as f:
                self.params.prompt = f.read()

        self.session_tokens: list[llama_cpp.llama_token] = []
        if (len(self.params.path_session) > 0):
            print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr)

            if (path.exists(self.params.path_session)):
                _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
                _n_token_count_out = llama_cpp.c_size_t()
                if (llama_cpp.llama_load_session_file(
                    self.ctx,
                    self.params.path_session.encode("utf8"),
                    _session_tokens,
                    self.params.n_ctx,
                    ctypes.byref(_n_token_count_out)
                ) != 1):
                    print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr)
                    return
                _n_token_count_out = _n_token_count_out.value
                self.session_tokens = _session_tokens[:_n_token_count_out]
                print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr)
            else:
                print("session file does not exist, will create", file=sys.stderr)

        # tokenize the prompt
        self.embd = []
        self.embd_inp = self._tokenize(self.params.prompt)

        if (len(self.embd_inp) > self.n_ctx - 4):
            raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})")

        # debug message about similarity of saved session, if applicable
        self.n_matching_session_tokens = 0
        if len(self.session_tokens) > 0:
            for id in self.session_tokens:
                if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]:
                    break
                self.n_matching_session_tokens += 1

            if self.n_matching_session_tokens >= len(self.embd_inp):
                print("session file has exact match for prompt!")
            elif self.n_matching_session_tokens < (len(self.embd_inp) / 2):
                print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated")
            else:
                print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt")

        self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)

        # number of tokens to keep when resetting context
        if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct):
            self.params.n_keep = len(self.embd_inp)

        self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix)
        self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False)

        # in instruct mode, we inject a prefix and a suffix to each input by the user
        self.antiecho = None
        if (self.params.instruct):
            self.params.interactive_start = True
            _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
            self.first_antiprompt.append(_ptn)
            self.antiecho = IterSearch(_ptn)

        # enable interactive mode if reverse prompt or interactive start is specified
        if (len(self.params.antiprompt) != 0 or self.params.interactive_start):
            self.params.interactive = True

        # determine newline token
        self.llama_token_newline = self._tokenize("\n", False)
        self.llama_token_eot = self._tokenize(" [end of text]\n", False)

        if (self.params.verbose_prompt):
            print(f"""
prompt: '{self.params.prompt}'
number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr)

            for i in range(len(self.embd_inp)):
                print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr)

            if (self.params.n_keep > 0):
                print("static prompt based on n_keep: '", file=sys.stderr)
                for i in range(self.params.n_keep):
                    print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr)
                print("'", file=sys.stderr)
            print(file=sys.stderr)

        if (self.params.interactive):
            print("interactive mode on.", file=sys.stderr)

            if (len(self.params.antiprompt) > 0):
                for antiprompt in self.params.antiprompt:
                    print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr)

            if len(self.params.input_prefix) > 0:
                print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr)

        print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\
repeat_penalty = {self.params.repeat_penalty},\
presence_penalty = {self.params.presence_penalty},\
frequency_penalty = {self.params.frequency_penalty},\
top_k = {self.params.top_k},\
tfs_z = {self.params.tfs_z},\
top_p = {self.params.top_p},\
typical_p = {self.params.typical_p},\
temp = {self.params.temp},\
mirostat = {self.params.mirostat},\
mirostat_lr = {self.params.mirostat_eta},\
mirostat_ent = {self.params.mirostat_tau},\

generate: n_ctx = {self.n_ctx},\
n_batch = {self.params.n_batch},\
n_predict = {self.params.n_predict},\
n_keep = {self.params.n_keep}

""", file=sys.stderr)

        # determine antiprompt tokens
        for i in self.params.antiprompt:
            self.first_antiprompt.append(self._tokenize(i, False))

        self.last_n_tokens = [0] * self.n_ctx  # TODO: deque doesn't support slices

        if (params.interactive):
            print("""== Running in interactive mode. ==
 - Press Ctrl+C to interject at any time.
 - Press Return to return control to LLaMa.
 - If you want to submit another line, end your input in '\\'.

""", file=sys.stderr)
            self.set_color(CONSOLE_COLOR_PROMPT)

    # tokenize a prompt
    def _tokenize(self, prompt, bos=True):
        _arr = (llama_cpp.llama_token * (len(prompt) + 1))()
        _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
        return _arr[:_n]
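
    # Editorial note (not part of the original file): _tokenize() returns a plain Python
    # list of llama token ids; slicing the ctypes array with _arr[:_n] copies it out.
    # bos=True prepends the BOS token (used for the main prompt), while bos=False omits
    # it (used below for reverse prompts, the newline token, and the instruct suffix).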

    def set_color(self, c):
        if (self.params.use_color):
            print(c, end="")

    def use_antiprompt(self):
        return len(self.first_antiprompt) > 0

    # generate tokens
    def generate(self):
        while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1:
            # predict
            if len(self.embd) > 0:
                # infinite text generation via context swapping
                # if we run out of context:
                # - take the n_keep first tokens from the original prompt (via n_past)
                # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
                if (self.n_past + len(self.embd) > self.n_ctx):
                    n_left = self.n_past - self.params.n_keep
                    self.n_past = self.params.n_keep

                    # insert n_left/2 tokens at the start of embd from last_n_tokens
                    _insert = self.last_n_tokens[
                        self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd)
                    ]
                    self.embd = _insert + self.embd
                    self.params.path_session = ""
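
                # Editorial note (not part of the original file): with, say, n_ctx = 2048
                # and n_keep = 48, the block above fires once n_past + len(embd) exceeds
                # 2048; it keeps the first 48 prompt tokens, re-inserts roughly
                # (2048 - 48) / 2 = 1000 of the most recent tokens from last_n_tokens,
                # and re-evaluates them as a single batch.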

                # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
                # REVIEW
                if self.n_session_consumed < len(self.session_tokens):
                    for i in range(len(self.embd)):
                        if self.embd[i] != self.session_tokens[self.n_session_consumed]:
                            self.session_tokens = self.session_tokens[:self.n_session_consumed]
                            break

                        self.n_past += 1
                        self.n_session_consumed += 1

                        if self.n_session_consumed >= len(self.session_tokens):
                            i += 1
                            break

                    if i > 0:
                        self.embd = self.embd[i:]

                # evaluate tokens in batches
                # embd is typically prepared beforehand to fit within a batch, but not always
                # TODO BUG: The batching code causes nonsensical generation
                """for i in range(0, len(self.embd), self.params.n_batch):
                    n_eval = self.params.n_batch
                    _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval])
                    if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
                        print(f"failed to eval")
                        return

                    self.n_past += n_eval"""

                if (llama_cpp.llama_eval(
                    self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
                ) != 0):
                    raise Exception("Failed to llama_eval!")

                if len(self.embd) > 0 and len(self.params.path_session) > 0:
                    self.session_tokens.extend(self.embd)
                    self.n_session_consumed = len(self.session_tokens)

            self.n_past += len(self.embd)
            self.embd = []
            if len(self.embd_inp) <= self.input_consumed:  # && !is_interacting
                # out of user input, sample next token
                top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k
                repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n

                # optionally save the session on first sample (for faster prompt loading next time)
                if len(self.params.path_session) > 0 and self.need_to_save_session:
                    self.need_to_save_session = False
                    llama_cpp.llama_save_session_file(
                        self.ctx,
                        self.params.path_session.encode("utf8"),
                        (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens),
                        len(self.session_tokens)
                    )

                id = 0

                logits = llama_cpp.llama_get_logits(self.ctx)
                n_vocab = llama_cpp.llama_n_vocab(self.ctx)

                # Apply params.logit_bias map
                for key, value in self.params.logit_bias.items():
                    logits[key] += value

                _arr = (llama_cpp.llama_token_data * n_vocab)(*[
                    llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
                    for token_id in range(n_vocab)
                ])
                candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))

                # Apply penalties
                nl_logit = logits[llama_cpp.llama_token_nl()]
                last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)

                _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
                llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
                    _arr,
                    last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
                llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
                    _arr,
                    last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))

                if not self.params.penalize_nl:
                    logits[llama_cpp.llama_token_nl()] = nl_logit

                if self.params.temp <= 0:
                    # Greedy sampling
                    id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
                else:
                    if self.params.mirostat == 1:
                        mirostat_mu = 2.0 * self.params.mirostat_tau
                        mirostat_m = 100
                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
                        id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu))
                    elif self.params.mirostat == 2:
                        mirostat_mu = 2.0 * self.params.mirostat_tau
                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
                        id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
                    else:
                        # Temperature sampling
                        llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
                        llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
                        llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
                        llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
                        id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
                    # print("`{}`".format(candidates_p.size))

                self.last_n_tokens.pop(0)
                self.last_n_tokens.append(id)

                # replace end of text token with newline token when in interactive mode
                if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
                    id = self.llama_token_newline[0]
                    self.embd.append(id)
                    if (self.use_antiprompt()):
                        # tokenize and inject first reverse prompt
                        self.embd_inp += self.first_antiprompt[0]
                        for id in self.first_antiprompt[0]:
                            self.embd.append(id)
                else:
                    # add it to the context
                    self.embd.append(id)

                # echo this to console
                self.output_echo = True

                # decrement remaining sampling budget
                self.remaining_tokens -= 1
            else:
                # output to console if input echo is on
                self.output_echo = self.params.input_echo

                # some user input remains from prompt or interaction, forward it to processing
                while len(self.embd_inp) > self.input_consumed:
                    self.embd.append(self.embd_inp[self.input_consumed])
                    self.last_n_tokens.pop(0)
                    self.last_n_tokens.append(self.embd_inp[self.input_consumed])
                    self.input_consumed += 1
                    if len(self.embd) >= self.params.n_batch:
                        break

            # display tokens
            if self.output_echo:
                for id in self.embd:
                    if self.antiecho is not None:
                        for r in self.antiecho(id):
                            yield r
                    else:
                        yield id

            # reset color to default if there is no pending user input
            if (self.params.input_echo and len(self.embd_inp) == self.input_consumed):
                self.set_color(CONSOLE_COLOR_DEFAULT)

            if (self.params.interactive and len(self.embd_inp) <= self.input_consumed):
                # if antiprompt is present, stop
                if (self.use_antiprompt()):
                    if True in [
                        i == self.last_n_tokens[-len(i):]
                        for i in self.first_antiprompt
                    ]:
                        break

                # if we are using instruction mode, and we have processed the initial prompt
                if (self.params.interactive_start):
                    break

            # end of text token
            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos():
                if (not self.params.instruct):
                    for i in self.llama_token_eot:
                        yield i
                break

            # respect n_predict even if antiprompt is present
            if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1):
                # If we aren't in instruction mode, fix the current generation by appending the antiprompt.
                # Makes it so if chat ends prematurely you don't append the AI's text etc.
                if not self.params.instruct:
                    self.embd_inp += self.first_antiprompt[0]
                self.n_remain = self.params.n_predict
                break

        self.params.interactive_start = False

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.exit()

    def exit(self):
        llama_cpp.llama_free(self.ctx)
        self.set_color(CONSOLE_COLOR_DEFAULT)

    # return past text
    def past(self):
        for id in self.last_n_tokens[-self.n_past:]:
            yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore")

    # write input
    def input(self, prompt: str):
        if (self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix):
            self.embd_inp += self.inp_prefix
        self.embd_inp += self._tokenize(prompt)
        if (self.params.instruct):
            self.embd_inp += self.inp_suffix

    # write output
    def output(self):
        self.remaining_tokens = self.params.n_predict
        for id in self.generate():
            yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")

    # read user input
    def read_input(self):
        out = ""
        while (t := input()).endswith("\\"):
            out += t[:-1] + "\n"
        return out + t + "\n"

    # interactive mode
    def interact(self):
        for i in self.output():
            print(i, end="", flush=True)
        self.params.input_echo = False

        while self.params.interactive:
            self.set_color(CONSOLE_COLOR_USER_INPUT)
            if (self.params.instruct):
                print('\n> ', end="")
                self.input(self.read_input())
            else:
                print(self.params.input_prefix, end="")
                self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}")
                print(self.params.input_suffix, end="")
            self.set_color(CONSOLE_COLOR_DEFAULT)

            try:
                for i in self.output():
                    print(i, end="", flush=True)
            except KeyboardInterrupt:
                self.set_color(CONSOLE_COLOR_DEFAULT)
                if not self.params.instruct:
                    print(self.params.fix_prefix, end="")
                    self.input(self.params.fix_prefix)

if __name__ == "__main__":
    from datetime import datetime

    USER_NAME = "User"
    AI_NAME = "ChatLLaMa"

    time_now = datetime.now()
    prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.

{USER_NAME}: Hello, {AI_NAME}!
{AI_NAME}: Hello {USER_NAME}! How may I help you today?
{USER_NAME}: What time is it?
{AI_NAME}: It is {time_now.strftime("%H:%M")}.
{USER_NAME}: What year is it?
{AI_NAME}: We are in {time_now.strftime("%Y")}.
{USER_NAME}: What is a cat?
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: Name a color.
{AI_NAME}: Blue
{USER_NAME}:"""
    params = gpt_params_parse()

    with LLaMAInteract(params) as m:
        m.interact()
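
For reference, a minimal non-interactive driver for the class above might look like the sketch below; the module and helper names (low_level_api_chat_cpp, common.gpt_params_parse) are assumptions, not part of this commit.

# Hypothetical usage sketch (editorial, not in this commit): drive LLaMAInteract
# programmatically instead of via interact().
from common import gpt_params_parse                # assumed helper module
from low_level_api_chat_cpp import LLaMAInteract   # assumed module name

params = gpt_params_parse()
with LLaMAInteract(params) as m:
    m.input("User: Name three primary colors.\n")
    for piece in m.output():   # output() decodes generated tokens to text
        print(piece, end="", flush=True)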
102
examples/low_level_api_llama_cpp.py
Normal file

@ -0,0 +1,102 @@
import multiprocessing

import llama_cpp

N_THREADS = multiprocessing.cpu_count()

prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"

lparams = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams)

# determine the required inference memory per token:
tmp = [0, 1, 2, 3]
llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS)

n_past = 0

prompt = b" " + prompt

embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
embd_inp = embd_inp[:n_of_tok]

n_ctx = llama_cpp.llama_n_ctx(ctx)

n_predict = 20
n_predict = min(n_predict, n_ctx - len(embd_inp))

input_consumed = 0
input_noecho = False

remaining_tokens = n_predict

embd = []
last_n_size = 64
last_n_tokens_data = [0] * last_n_size
n_batch = 24
last_n_repeat = 64
repeat_penalty = 1
frequency_penalty = 0.0
presence_penalty = 0.0

while remaining_tokens > 0:
    if len(embd) > 0:
        llama_cpp.llama_eval(
            ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS
        )

    n_past += len(embd)
    embd = []
    if len(embd_inp) <= input_consumed:
        logits = llama_cpp.llama_get_logits(ctx)
        n_vocab = llama_cpp.llama_n_vocab(ctx)

        _arr = (llama_cpp.llama_token_data * n_vocab)(*[
            llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
            for token_id in range(n_vocab)
        ])
        candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))

        _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
        llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
            _arr,
            last_n_repeat, repeat_penalty)
        llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
            _arr,
            last_n_repeat, frequency_penalty, presence_penalty)

        llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, min_keep=llama_cpp.c_size_t(1))
        llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, min_keep=llama_cpp.c_size_t(1))
        llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
        id = llama_cpp.llama_sample_token(ctx, candidates_p)

        last_n_tokens_data = last_n_tokens_data[1:] + [id]
        embd.append(id)
        input_noecho = False
        remaining_tokens -= 1
    else:
        while len(embd_inp) > input_consumed:
            embd.append(embd_inp[input_consumed])
            last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
            input_consumed += 1
            if len(embd) >= n_batch:
                break
    if not input_noecho:
        for id in embd:
            print(
                llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"),
                end="",
                flush=True,
            )

    if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos():
        break

print()

llama_cpp.llama_print_timings(ctx)

llama_cpp.llama_free(ctx)
25
examples/quantize.py
Normal file

@ -0,0 +1,25 @@
import os
import argparse
import llama_cpp


def main(args):
    # encode paths for the C API; validate them as plain strings first
    fname_inp = args.fname_inp.encode("utf-8")
    fname_out = args.fname_out.encode("utf-8")
    if not os.path.exists(args.fname_inp):
        raise RuntimeError(f"Input file does not exist ({args.fname_inp})")
    if os.path.exists(args.fname_out):
        raise RuntimeError(f"Output file already exists ({args.fname_out})")
    itype = args.type
    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
    if return_code != 0:
        raise RuntimeError("Failed to quantize model")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("fname_inp", type=str, help="Path to input model")
    parser.add_argument("fname_out", type=str, help="Path to output model")
    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
    args = parser.parse_args()
    main(args)
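
For reference, the same quantization can be driven directly from Python; the model paths in this sketch are placeholders, not files included in this commit.

# Hypothetical direct call to the binding used by quantize.py above.
import llama_cpp

ret = llama_cpp.llama_model_quantize(
    b"./models/7B/ggml-model-f16.bin",   # fname_inp (placeholder path)
    b"./models/7B/ggml-model-q4_0.bin",  # fname_out (placeholder path)
    2,                                   # itype: 2 = q4_0, 3 = q4_1
)
if ret != 0:
    raise RuntimeError("Failed to quantize model")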