From d9dfdec2bdb1caa9aea3c272f82ababbb61d664d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 23 Mar 2023 05:33:06 -0400 Subject: [PATCH 01/77] Initial commit (llama_cpp.py, llama-cpp-python) --- examples/llama_cpp.py | 216 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 examples/llama_cpp.py diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py new file mode 100644 index 000000000..9e741dfc6 --- /dev/null +++ b/examples/llama_cpp.py @@ -0,0 +1,216 @@ +import ctypes + +from ctypes import ( + c_int, + c_float, + c_double, + c_char_p, + c_void_p, + c_bool, + POINTER, + Structure, +) + +import pathlib + +# Load the library +libfile = pathlib.Path(__file__).parent / "libllama.so" +_lib = ctypes.CDLL(str(libfile)) + + +# C types +llama_token = c_int +llama_token_p = POINTER(llama_token) + + +class llama_token_data(Structure): + _fields_ = [ + ("id", llama_token), # token id + ("p", c_float), # probability of the token + ("plog", c_float), # log probability of the token + ] + + +llama_token_data_p = POINTER(llama_token_data) + + +class llama_context_params(Structure): + _fields_ = [ + ("n_ctx", c_int), # text context + ("n_parts", c_int), # -1 for default + ("seed", c_int), # RNG seed, 0 for random + ("f16_kv", c_bool), # use fp16 for KV cache + ( + "logits_all", + c_bool, + ), # the llama_eval() call computes all logits, not just the last one + ("vocab_only", c_bool), # only load the vocabulary, no weights + ] + + +llama_context_params_p = POINTER(llama_context_params) + +llama_context_p = c_void_p + +# C functions +lib.llama_context_default_params.argtypes = [] +lib.llama_context_default_params.restype = llama_context_params + +lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] +lib.llama_init_from_file.restype = llama_context_p + +lib.llama_free.argtypes = [llama_context_p] +lib.llama_free.restype = None + +lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +lib.llama_model_quantize.restype = c_int + +lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] +lib.llama_eval.restype = c_int + +lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] +lib.llama_tokenize.restype = c_int + +lib.llama_n_vocab.argtypes = [llama_context_p] +lib.llama_n_vocab.restype = c_int + +lib.llama_n_ctx.argtypes = [llama_context_p] +lib.llama_n_ctx.restype = c_int + +lib.llama_get_logits.argtypes = [llama_context_p] +lib.llama_get_logits.restype = POINTER(c_float) + +lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] +lib.llama_token_to_str.restype = c_char_p + +lib.llama_token_bos.argtypes = [] +lib.llama_token_bos.restype = llama_token + +lib.llama_token_eos.argtypes = [] +lib.llama_token_eos.restype = llama_token + +lib.llama_sample_top_p_top_k.argtypes = [ + llama_context_p, + llama_token_p, + c_int, + c_int, + c_double, + c_double, + c_double, +] +lib.llama_sample_top_p_top_k.restype = llama_token + +lib.llama_print_timings.argtypes = [llama_context_p] +lib.llama_print_timings.restype = None + +lib.llama_reset_timings.argtypes = [llama_context_p] +lib.llama_reset_timings.restype = None + +lib.llama_print_system_info.argtypes = [] +lib.llama_print_system_info.restype = c_char_p + + +# Python functions +def llama_context_default_params() -> llama_context_params: + return _lib.llama_context_default_params() + + +def llama_init_from_file( + path_model: bytes, params: llama_context_params +) -> llama_context_p: + """Various functions for loading a 
ggml llama model. + Allocate (almost) all memory needed for the model. + Return NULL on failure""" + return _lib.llama_init_from_file(path_model, params) + + +def llama_free(ctx: llama_context_p): + """Free all allocated memory""" + return _lib.llama_free(ctx) + + +def llama_model_quantize( + fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int +) -> c_int: + """Returns 0 on success""" + return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk) + + +def llama_eval( + ctx: llama_context_p, + tokens: llama_token_p, + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> c_int: + """Run the llama inference to obtain the logits and probabilities for the next token. + tokens + n_tokens is the provided batch of new tokens to process + n_past is the number of tokens to use from previous eval calls + Returns 0 on success""" + return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) + + +def llama_tokenize( + ctx: llama_context_p, + text: bytes, + tokens: llama_token_p, + n_max_tokens: c_int, + add_bos: c_bool, +) -> c_int: + return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) + + +def llama_n_vocab(ctx: llama_context_p) -> c_int: + return _lib.llama_n_vocab(ctx) + + +def llama_n_ctx(ctx: llama_context_p) -> c_int: + return _lib.llama_n_ctx(ctx) + + +def llama_get_logits(ctx: llama_context_p): + """Token logits obtained from the last call to llama_eval() + The logits for the last token are stored in the last row + Can be mutated in order to change the probabilities of the next token + Rows: n_tokens + Cols: n_vocab""" + return _lib.llama_get_logits(ctx) + + +def llama_token_to_str(ctx: llama_context_p, token: int) -> bytes: + """Token Id -> String. Uses the vocabulary in the provided context""" + return _lib.llama_token_to_str(ctx, token) + + +def llama_token_bos() -> llama_token: + return _lib.llama_token_bos() + + +def llama_token_eos() -> llama_token: + return _lib.llama_token_eos() + + +def llama_sample_top_p_top_k( + ctx: llama_context_p, + last_n_tokens_data: llama_token_p, + last_n_tokens_size: c_int, + top_k: c_int, + top_p: c_double, + temp: c_double, + repeat_penalty: c_double, +) -> llama_token: + return _lib.llama_sample_top_p_top_k( + ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty + ) + + +def llama_print_timings(ctx: llama_context_p): + _lib.llama_print_timings(ctx) + + +def llama_reset_timings(ctx: llama_context_p): + _lib.llama_reset_timings(ctx) + + +def llama_print_system_info() -> bytes: + return _lib.llama_print_system_info() From ef5a9a616014828294070311526a802c07b52ec7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 24 Mar 2023 14:58:42 -0400 Subject: [PATCH 02/77] Update llama.cpp and re-organize low-level api --- examples/llama_cpp.py | 189 ++++++++++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 73 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 9e741dfc6..638f14238 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -19,6 +19,9 @@ _lib = ctypes.CDLL(str(libfile)) # C types +llama_context_p = c_void_p + + llama_token = c_int llama_token_p = POINTER(llama_token) @@ -45,97 +48,63 @@ class llama_context_params(Structure): c_bool, ), # the llama_eval() call computes all logits, not just the last one ("vocab_only", c_bool), # only load the vocabulary, no weights + ("use_mlock", c_bool), # force system to keep model in RAM + ("embedding", c_bool), # embedding mode only ] llama_context_params_p = POINTER(llama_context_params) 
-llama_context_p = c_void_p -# C functions -lib.llama_context_default_params.argtypes = [] -lib.llama_context_default_params.restype = llama_context_params - -lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] -lib.llama_init_from_file.restype = llama_context_p - -lib.llama_free.argtypes = [llama_context_p] -lib.llama_free.restype = None - -lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] -lib.llama_model_quantize.restype = c_int - -lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] -lib.llama_eval.restype = c_int - -lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] -lib.llama_tokenize.restype = c_int - -lib.llama_n_vocab.argtypes = [llama_context_p] -lib.llama_n_vocab.restype = c_int - -lib.llama_n_ctx.argtypes = [llama_context_p] -lib.llama_n_ctx.restype = c_int - -lib.llama_get_logits.argtypes = [llama_context_p] -lib.llama_get_logits.restype = POINTER(c_float) - -lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] -lib.llama_token_to_str.restype = c_char_p - -lib.llama_token_bos.argtypes = [] -lib.llama_token_bos.restype = llama_token - -lib.llama_token_eos.argtypes = [] -lib.llama_token_eos.restype = llama_token - -lib.llama_sample_top_p_top_k.argtypes = [ - llama_context_p, - llama_token_p, - c_int, - c_int, - c_double, - c_double, - c_double, -] -lib.llama_sample_top_p_top_k.restype = llama_token - -lib.llama_print_timings.argtypes = [llama_context_p] -lib.llama_print_timings.restype = None - -lib.llama_reset_timings.argtypes = [llama_context_p] -lib.llama_reset_timings.restype = None - -lib.llama_print_system_info.argtypes = [] -lib.llama_print_system_info.restype = c_char_p +# Functions -# Python functions def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() +_lib.llama_context_default_params.argtypes = [] +_lib.llama_context_default_params.restype = llama_context_params + + +# Various functions for loading a ggml llama model. +# Allocate (almost) all memory needed for the model. +# Return NULL on failure def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: - """Various functions for loading a ggml llama model. - Allocate (almost) all memory needed for the model. - Return NULL on failure""" return _lib.llama_init_from_file(path_model, params) +_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] +_lib.llama_init_from_file.restype = llama_context_p + + +# Frees all allocated memory def llama_free(ctx: llama_context_p): - """Free all allocated memory""" return _lib.llama_free(ctx) +_lib.llama_free.argtypes = [llama_context_p] +_lib.llama_free.restype = None + + +# TODO: not great API - very likely to change +# Returns 0 on success def llama_model_quantize( fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int ) -> c_int: - """Returns 0 on success""" return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk) +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +_lib.llama_model_quantize.restype = c_int + + +# Run the llama inference to obtain the logits and probabilities for the next token. 
+# tokens + n_tokens is the provided batch of new tokens to process +# n_past is the number of tokens to use from previous eval calls +# Returns 0 on success def llama_eval( ctx: llama_context_p, tokens: llama_token_p, @@ -143,13 +112,18 @@ def llama_eval( n_past: c_int, n_threads: c_int, ) -> c_int: - """Run the llama inference to obtain the logits and probabilities for the next token. - tokens + n_tokens is the provided batch of new tokens to process - n_past is the number of tokens to use from previous eval calls - Returns 0 on success""" return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) +_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] +_lib.llama_eval.restype = c_int + + +# Convert the provided text into tokens. +# The tokens pointer must be large enough to hold the resulting tokens. +# Returns the number of tokens on success, no more than n_max_tokens +# Returns a negative number on failure - the number of tokens that would have been returned +# TODO: not sure if correct def llama_tokenize( ctx: llama_context_p, text: bytes, @@ -160,36 +134,77 @@ def llama_tokenize( return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) +_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] +_lib.llama_tokenize.restype = c_int + + def llama_n_vocab(ctx: llama_context_p) -> c_int: return _lib.llama_n_vocab(ctx) +_lib.llama_n_vocab.argtypes = [llama_context_p] +_lib.llama_n_vocab.restype = c_int + + def llama_n_ctx(ctx: llama_context_p) -> c_int: return _lib.llama_n_ctx(ctx) +_lib.llama_n_ctx.argtypes = [llama_context_p] +_lib.llama_n_ctx.restype = c_int + + +# Token logits obtained from the last call to llama_eval() +# The logits for the last token are stored in the last row +# Can be mutated in order to change the probabilities of the next token +# Rows: n_tokens +# Cols: n_vocab def llama_get_logits(ctx: llama_context_p): - """Token logits obtained from the last call to llama_eval() - The logits for the last token are stored in the last row - Can be mutated in order to change the probabilities of the next token - Rows: n_tokens - Cols: n_vocab""" return _lib.llama_get_logits(ctx) +_lib.llama_get_logits.argtypes = [llama_context_p] +_lib.llama_get_logits.restype = POINTER(c_float) + + +# Get the embeddings for the input +# shape: [n_embd] (1-dimensional) +def llama_get_embeddings(ctx: llama_context_p): + return _lib.llama_get_embeddings(ctx) + + +_lib.llama_get_embeddings.argtypes = [llama_context_p] +_lib.llama_get_embeddings.restype = POINTER(c_float) + + +# Token Id -> String. Uses the vocabulary in the provided context def llama_token_to_str(ctx: llama_context_p, token: int) -> bytes: - """Token Id -> String. Uses the vocabulary in the provided context""" return _lib.llama_token_to_str(ctx, token) +_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] +_lib.llama_token_to_str.restype = c_char_p + +# Special tokens + + def llama_token_bos() -> llama_token: return _lib.llama_token_bos() +_lib.llama_token_bos.argtypes = [] +_lib.llama_token_bos.restype = llama_token + + def llama_token_eos() -> llama_token: return _lib.llama_token_eos() +_lib.llama_token_eos.argtypes = [] +_lib.llama_token_eos.restype = llama_token + + +# TODO: improve the last_n_tokens interface ? 
def llama_sample_top_p_top_k( ctx: llama_context_p, last_n_tokens_data: llama_token_p, @@ -204,13 +219,41 @@ def llama_sample_top_p_top_k( ) +_lib.llama_sample_top_p_top_k.argtypes = [ + llama_context_p, + llama_token_p, + c_int, + c_int, + c_double, + c_double, + c_double, +] +_lib.llama_sample_top_p_top_k.restype = llama_token + + +# Performance information + + def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) +_lib.llama_print_timings.argtypes = [llama_context_p] +_lib.llama_print_timings.restype = None + + def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) +_lib.llama_reset_timings.argtypes = [llama_context_p] +_lib.llama_reset_timings.restype = None + + +# Print system information def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() + + +_lib.llama_print_system_info.argtypes = [] +_lib.llama_print_system_info.restype = c_char_p From bd1c657f80ffe6b8cf56a55a39b16eaa20e5a056 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 4 Apr 2023 22:36:59 -0400 Subject: [PATCH 03/77] Bugfix: wrong signature for quantize function --- examples/llama_cpp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 638f14238..f7149ed67 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -92,12 +92,12 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int + fname_inp: bytes, fname_out: bytes, itype: c_int ) -> c_int: - return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk) + return _lib.llama_model_quantize(fname_inp, fname_out, itype) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int From a3da39af79cecb1dd94bf9e01f11d7a06a1b493e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 24 Mar 2023 18:43:29 -0400 Subject: [PATCH 04/77] Bugfix: cross-platform method to find shared lib --- examples/llama_cpp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index f7149ed67..bafc40112 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -12,11 +12,15 @@ from ctypes import ( ) import pathlib +from itertools import chain # Load the library -libfile = pathlib.Path(__file__).parent / "libllama.so" -_lib = ctypes.CDLL(str(libfile)) - +# TODO: fragile, should fix +_base_path = pathlib.Path(__file__).parent +(_lib_path,) = chain( + _base_path.glob("*.so"), _base_path.glob("*.dylib"), _base_path.glob("*.dll") +) +_lib = ctypes.CDLL(str(_lib_path)) # C types llama_context_p = c_void_p From 019650f41628d72ee0cfb1448e0d259f22fccaff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 31 Mar 2023 02:08:20 -0400 Subject: [PATCH 05/77] Fix array type signatures --- examples/llama_cpp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index bafc40112..1e8054e5d 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -111,7 +111,7 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: llama_token_p, + tokens: ctypes.Array[llama_token], n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -131,7 +131,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( 
ctx: llama_context_p, text: bytes, - tokens: llama_token_p, + tokens: ctypes.Array[llama_token], n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -163,7 +163,7 @@ _lib.llama_n_ctx.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +def llama_get_logits(ctx: llama_context_p) -> ctypes.Array[c_float]: return _lib.llama_get_logits(ctx) @@ -173,7 +173,7 @@ _lib.llama_get_logits.restype = POINTER(c_float) # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p) -> ctypes.Array[c_float]: return _lib.llama_get_embeddings(ctx) @@ -211,7 +211,7 @@ _lib.llama_token_eos.restype = llama_token # TODO: improve the last_n_tokens interface ? def llama_sample_top_p_top_k( ctx: llama_context_p, - last_n_tokens_data: llama_token_p, + last_n_tokens_data: ctypes.Array[llama_token], last_n_tokens_size: c_int, top_k: c_int, top_p: c_double, From a7a6d88793deaf73629adffefc9e820dda5c52ef Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 31 Mar 2023 03:20:15 -0400 Subject: [PATCH 06/77] Fix ctypes typing issue for Arrays --- examples/llama_cpp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 1e8054e5d..2a43ca328 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -9,8 +9,8 @@ from ctypes import ( c_bool, POINTER, Structure, + Array ) - import pathlib from itertools import chain @@ -111,7 +111,7 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: ctypes.Array[llama_token], + tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -131,7 +131,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens: ctypes.Array[llama_token], + tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -163,7 +163,7 @@ _lib.llama_n_ctx.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p) -> ctypes.Array[c_float]: +def llama_get_logits(ctx: llama_context_p): return _lib.llama_get_logits(ctx) @@ -173,7 +173,7 @@ _lib.llama_get_logits.restype = POINTER(c_float) # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p) -> ctypes.Array[c_float]: +def llama_get_embeddings(ctx: llama_context_p): return _lib.llama_get_embeddings(ctx) @@ -211,7 +211,7 @@ _lib.llama_token_eos.restype = llama_token # TODO: improve the last_n_tokens interface ? def llama_sample_top_p_top_k( ctx: llama_context_p, - last_n_tokens_data: ctypes.Array[llama_token], + last_n_tokens_data, # type: Array[llama_token] last_n_tokens_size: c_int, top_k: c_int, top_p: c_double, From 5bb1bc74d1764059a2bae937bddd8960d5e46e27 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 31 Mar 2023 03:25:12 -0400 Subject: [PATCH 07/77] Fix type signature of token_to_str --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 2a43ca328..214050855 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -182,7 +182,7 @@ _lib.llama_get_embeddings.restype = POINTER(c_float) # Token Id -> String. 
Uses the vocabulary in the provided context -def llama_token_to_str(ctx: llama_context_p, token: int) -> bytes: +def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) From def46dd9a68a6d6fb7818885efbd59e97175ec63 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 24 Mar 2023 18:57:25 -0400 Subject: [PATCH 08/77] Add example based on stripped down version of main.cpp from llama.cpp --- examples/low_level_api_llama_cpp.py | 85 +++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 examples/low_level_api_llama_cpp.py diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py new file mode 100644 index 000000000..4a888c355 --- /dev/null +++ b/examples/low_level_api_llama_cpp.py @@ -0,0 +1,85 @@ +import llama_cpp + +import multiprocessing + +import llama_cpp + +N_THREADS = multiprocessing.cpu_count() + +prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" + +lparams = llama_cpp.llama_context_default_params() +ctx = llama_cpp.llama_init_from_file(b"models/ggml-alpaca-7b-q4.bin", lparams) + +# determine the required inference memory per token: +tmp = [0, 1, 2, 3] +llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS) + +n_past = 0 + +prompt = b" " + prompt + +embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() +n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True) +embd_inp = embd_inp[:n_of_tok] + +n_ctx = llama_cpp.llama_n_ctx(ctx) + +n_predict = 20 +n_predict = min(n_predict, n_ctx - len(embd_inp)) + +input_consumed = 0 +input_noecho = False + +remaining_tokens = n_predict + +embd = [] +last_n_size = 64 +last_n_tokens = [0] * last_n_size +n_batch = 24 + +while remaining_tokens > 0: + if len(embd) > 0: + llama_cpp.llama_eval( + ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS + ) + + n_past += len(embd) + embd = [] + if len(embd_inp) <= input_consumed: + id = llama_cpp.llama_sample_top_p_top_k( + ctx, + (llama_cpp.c_int * len(last_n_tokens))(*last_n_tokens), + len(last_n_tokens), + 40, + 0.8, + 0.2, + 1.0 / 0.85, + ) + last_n_tokens = last_n_tokens[1:] + [id] + embd.append(id) + input_noecho = False + remaining_tokens -= 1 + else: + while len(embd_inp) > input_consumed: + embd.append(embd_inp[input_consumed]) + last_n_tokens = last_n_tokens[1:] + [embd_inp[input_consumed]] + input_consumed += 1 + if len(embd) >= n_batch: + break + if not input_noecho: + for id in embd: + print( + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"), + end="", + flush=True, + ) + + if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(): + break + +print() + +llama_cpp.llama_print_timings(ctx) + +llama_cpp.llama_free(ctx) From ef3c152257a357542be6a99eb6e44394fba01a70 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 25 Mar 2023 12:12:09 -0400 Subject: [PATCH 09/77] Update llama.cpp (llama_progress_callback) --- examples/llama_cpp.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 214050855..b5f83baa2 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -40,6 +40,7 @@ class llama_token_data(Structure): llama_token_data_p = POINTER(llama_token_data) +llama_progress_callback = ctypes.CFUNCTYPE(None, c_double, c_void_p) class llama_context_params(Structure): _fields_ = [ @@ -54,6 +55,10 @@ class llama_context_params(Structure): ("vocab_only", c_bool), # only load the vocabulary, no weights 
("use_mlock", c_bool), # force system to keep model in RAM ("embedding", c_bool), # embedding mode only + # called with a progress value between 0 and 1, pass NULL to disable + ("progress_callback", llama_progress_callback), + # context pointer passed to the progress callback + ("progress_callback_user_data", c_void_p), ] From a279acd680db28d7fc00cf68f81ee45c2b9dd3ef Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 25 Mar 2023 16:26:03 -0400 Subject: [PATCH 10/77] Update llama.cpp (llama_n_embd) --- examples/llama_cpp.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index b5f83baa2..1862605b4 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -42,6 +42,7 @@ llama_token_data_p = POINTER(llama_token_data) llama_progress_callback = ctypes.CFUNCTYPE(None, c_double, c_void_p) + class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context @@ -163,6 +164,14 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int +def llama_n_embd(ctx: llama_context_p) -> c_int: + return _lib.llama_n_ctx(ctx) + + +_lib.llama_n_embd.argtypes = [llama_context_p] +_lib.llama_n_embd.restype = c_int + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token From a71cda6546661e233ece69cb02d6b43a07ddeeb4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 28 Mar 2023 21:10:23 -0400 Subject: [PATCH 11/77] Update llama.cpp --- examples/llama_cpp.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 1862605b4..156139f71 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -3,7 +3,6 @@ import ctypes from ctypes import ( c_int, c_float, - c_double, c_char_p, c_void_p, c_bool, @@ -40,7 +39,7 @@ class llama_token_data(Structure): llama_token_data_p = POINTER(llama_token_data) -llama_progress_callback = ctypes.CFUNCTYPE(None, c_double, c_void_p) +llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) class llama_context_params(Structure): @@ -228,9 +227,9 @@ def llama_sample_top_p_top_k( last_n_tokens_data, # type: Array[llama_token] last_n_tokens_size: c_int, top_k: c_int, - top_p: c_double, - temp: c_double, - repeat_penalty: c_double, + top_p: c_float, + temp: c_float, + repeat_penalty: c_float, ) -> llama_token: return _lib.llama_sample_top_p_top_k( ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty @@ -242,9 +241,9 @@ _lib.llama_sample_top_p_top_k.argtypes = [ llama_token_p, c_int, c_int, - c_double, - c_double, - c_double, + c_float, + c_float, + c_float, ] _lib.llama_sample_top_p_top_k.restype = llama_token From 62ce167b22580e4b697be2e31e4f61a53fd10475 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 1 Apr 2023 13:02:10 -0400 Subject: [PATCH 12/77] Update low level api example --- examples/llama_cpp.py | 35 +++++++++++++++++++++++++++-- examples/low_level_api_llama_cpp.py | 10 ++++----- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 156139f71..03232560f 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -1,5 +1,4 @@ import ctypes - from ctypes import ( c_int, c_float, @@ -8,7 +7,9 @@ from ctypes import ( c_bool, POINTER, Structure, - Array + Array, + c_uint8, + c_size_t ) import pathlib from itertools import chain @@ -109,6 +110,36 
@@ def llama_model_quantize( _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int +# Returns the KV cache that will contain the context for the +# ongoing prediction with the model. +def llama_get_kv_cache(ctx: llama_context_p): + return _lib.llama_get_kv_cache(ctx) + +_lib.llama_get_kv_cache.argtypes = [llama_context_p] +_lib.llama_get_kv_cache.restype = POINTER(c_uint8) + +# Returns the size of the KV cache +def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: + return _lib.llama_get_kv_cache_size(ctx) + +_lib.llama_get_kv_cache_size.argtypes = [llama_context_p] +_lib.llama_get_kv_cache_size.restype = c_size_t + +# Returns the number of tokens in the KV cache +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: + return _lib.llama_get_kv_cache_token_count(ctx) + +_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] +_lib.llama_get_kv_cache_token_count.restype = c_int + + +# Sets the KV cache containing the current context for the model +def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int): + return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) + +_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] +_lib.llama_set_kv_cache.restype = None + # Run the llama inference to obtain the logits and probabilities for the next token. # tokens + n_tokens is the provided batch of new tokens to process diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 4a888c355..2a639aad5 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -35,7 +35,7 @@ remaining_tokens = n_predict embd = [] last_n_size = 64 -last_n_tokens = [0] * last_n_size +last_n_tokens_data = [0] * last_n_size n_batch = 24 while remaining_tokens > 0: @@ -49,21 +49,21 @@ while remaining_tokens > 0: if len(embd_inp) <= input_consumed: id = llama_cpp.llama_sample_top_p_top_k( ctx, - (llama_cpp.c_int * len(last_n_tokens))(*last_n_tokens), - len(last_n_tokens), + (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data), + len(last_n_tokens_data), 40, 0.8, 0.2, 1.0 / 0.85, ) - last_n_tokens = last_n_tokens[1:] + [id] + last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) input_noecho = False remaining_tokens -= 1 else: while len(embd_inp) > input_consumed: embd.append(embd_inp[input_consumed]) - last_n_tokens = last_n_tokens[1:] + [embd_inp[input_consumed]] + last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]] input_consumed += 1 if len(embd) >= n_batch: break From 2b8147e7a8881d91ec7da933262074101b44e30f Mon Sep 17 00:00:00 2001 From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com> Date: Sun, 2 Apr 2023 21:50:13 -0400 Subject: [PATCH 13/77] Update llama_cpp.py --- examples/llama_cpp.py | 48 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 03232560f..fe9a8934b 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -1,3 +1,5 @@ +import sys +import os import ctypes from ctypes import ( c_int, @@ -12,15 +14,47 @@ from ctypes import ( c_size_t ) import pathlib -from itertools import chain # Load the library -# TODO: fragile, should fix -_base_path = pathlib.Path(__file__).parent -(_lib_path,) = chain( - _base_path.glob("*.so"), _base_path.glob("*.dylib"), _base_path.glob("*.dll") -) -_lib = 
ctypes.CDLL(str(_lib_path)) +def _load_shared_library(lib_base_name): + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + lib_ext = ".so" + elif sys.platform == "darwin": + lib_ext = ".dylib" + elif sys.platform == "win32": + lib_ext = ".dll" + else: + raise RuntimeError("Unsupported platform") + + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(__file__).parent.resolve() + # Searching for the library in the current directory under the name "libllama" (default name + # for llamacpp) and "llama" (default name for this repo) + _lib_paths = [ + _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path / f"{lib_base_name}{lib_ext}" + ] + + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path)) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found") + +# Specify the base name of the shared library to load +_lib_base_name = "llama" + +# Load the library +_lib = _load_shared_library(_lib_base_name) # C types llama_context_p = c_void_p From 15bea0946b890a8e69deb739d790318ab4600ba8 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 3 Apr 2023 22:54:46 +0200 Subject: [PATCH 14/77] Chat llama.cpp example implementation --- examples/low_level_api_chat_cpp.py | 235 +++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 examples/low_level_api_chat_cpp.py diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py new file mode 100644 index 000000000..ec9703d1f --- /dev/null +++ b/examples/low_level_api_chat_cpp.py @@ -0,0 +1,235 @@ +""" +This is an example implementation of main.cpp from llama.cpp +Quirks: + * Its not exactly alike since this port is designed around programmatic I/O + * Input is always echoed if on, so it should be turned off when using "input()" + * The first antiprompt should be the userprompt like "\nUser:", + because its added when n_predict is reached (aka generation ended prematurely) + * n_predict can be set to -1 for unlimited length responses +""" +import llama_cpp + +def toIntArray(lst): + return [int(i) for i in lst] + +# A LLaMA interactive session +class LLaMAInteract: + def __init__(self, + primer: str="", + model: str="./models/30B/ggml-model-q4_0.bin", + n_ctx: int=1024, + seed: int=0, + n_threads: int=8, + antiprompt: list[str]=[], + input_echo: bool=True, + n_predict: int=20, + n_batch: int=8, + repeat_last_n: int=64, + top_k: int=50, + top_p: float=1., + temp: float=1.0, + repeat_penalty: float=1, + ) -> None: + # input args + self.n_threads = n_threads + self.input_echo = input_echo + self.n_predict = n_predict + self.n_batch = n_batch + self.repeat_last_n = repeat_last_n + self.top_k=top_k + self.top_p=top_p + self.temp=temp + self.repeat_penalty=repeat_penalty + self.n_ctx = n_ctx + self.seed = seed + + # runtime args + self.input_consumed = 0 + self.embd = [] + self.embd_inp = [] + self.n_past = 0 + self.first_antiprompt = [] + self.remaining_tokens = self.n_predict + self.output_echo = input_echo + + # model load + self.lparams = llama_cpp.llama_context_default_params() + self.lparams.n_ctx = self.n_ctx + self.lparams.seed 
= self.seed + self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) + + # determine the required inference memory per token: + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + + # determine newline token + self.llama_token_newline = (llama_cpp.llama_token * 1)() + llama_cpp.llama_tokenize(self.ctx, b"\n", self.llama_token_newline, len(self.llama_token_newline), False) + self.llama_token_newline = toIntArray(self.llama_token_newline) + + # primer feed + if (len(primer) > 0): + self.input(primer) + self.n_keep = len(self.embd_inp) + + # create internal context + self.n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + # determine antiprompt tokens + for i in antiprompt: + d_antiprompt = (llama_cpp.llama_token * (len(i) + 1))() + n_antiprompt = llama_cpp.llama_tokenize(self.ctx, i.encode("utf8"), d_antiprompt, len(d_antiprompt), False) + self.first_antiprompt.append(toIntArray(d_antiprompt[:n_antiprompt])) + + # if an antiprompt is present + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + + def generate(self): + while self.remaining_tokens > 0 or self.use_antiprompt(): + # predict + if len(self.embd) > 0: + # infinite text generation via context swapping + # if we run out of context: + # - take the n_keep first tokens from the original prompt (via n_past) + # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + if (self.n_past + len(self.embd) > self.n_ctx): + n_left = self.n_past - self.n_keep + self.n_past = self.n_keep + + # insert n_left/2 tokens at the start of embd from last_n_tokens + _insert = self.last_n_tokens[ + -(int(n_left/2) - len(self.embd)):-len(self.embd) + ] + self.embd[:len(_insert)] = _insert + #TODO: Still untested + + if (llama_cpp.llama_eval( + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads + ) != 0): + raise Exception("Failed to llama_eval!") + + self.n_past += len(self.embd) + self.embd = [] + if len(self.embd_inp) <= self.input_consumed: + # out of user input, sample next token + _arr = self.last_n_tokens[-min(self.repeat_last_n, self.n_past):] + id = llama_cpp.llama_sample_top_p_top_k( + self.ctx, + (llama_cpp.llama_token * len(_arr))(*_arr), + len(_arr), + self.top_k, + self.top_p, + self.temp, + self.repeat_penalty, + ) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(int(id)) + + # replace end of text token with newline token when in interactive mode + if (id == llama_cpp.llama_token_eos() and self.use_antiprompt()): + id = self.llama_token_newline[0] + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] + + # add it to the context + self.embd.append(int(id)) + + # echo this to console + self.output_echo = True + + # decrement remaining sampling budget + self.remaining_tokens -= 1 + else: + # output to console if input echo is on + self.output_echo = self.input_echo + + # some user input remains from prompt or interaction, forward it to processing + while len(self.embd_inp) > self.input_consumed: + self.embd.append(int(self.embd_inp[self.input_consumed])) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(int(self.embd_inp[self.input_consumed])) + self.input_consumed += 1 + if len(self.embd) >= self.n_batch: + break + + # display tokens + if self.output_echo: + for id in self.embd: + yield id + + # if antiprompt is present, stop + if 
(self.use_antiprompt() and len(self.embd_inp) <= self.input_consumed): + for i in self.first_antiprompt: + if i == self.last_n_tokens[-len(i):]: + return + + # if end of generation + if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + break + + # respect n_predict even if antiprompt is present + if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): + self.embd_inp += self.first_antiprompt[0] + break + + def past(self): + for id in self.last_n_tokens[-self.n_past:]: + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + + def input(self, prompt: str): + embd_arr = (llama_cpp.llama_token * (len(prompt) + 1))() + n_of_tok = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), embd_arr, len(embd_arr), True) + self.embd_inp += toIntArray(embd_arr[:n_of_tok]) + + def output(self): + self.remaining_tokens = self.n_predict + for id in self.generate(): + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + +if __name__ == "__main__": + from datetime import datetime + + USER_NAME="User" + AI_NAME="ChatLLaMa" + + time_now = datetime.now() + prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What time is it? +{AI_NAME}: It is {time_now.strftime("%H:%M")}. +{USER_NAME}: What year is it? +{AI_NAME}: We are in {time_now.strftime("%Y")}. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: Name a color. 
+{AI_NAME}: Blue +{USER_NAME}:""" + + print("Loading model...") + ll = LLaMAInteract(prompt, + model="./models/30B/ggml-model-q4_0.bin", + n_ctx=2048, + antiprompt=[f"\n{USER_NAME}:"], + repeat_last_n=256, + n_predict=2048, + temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 + ) + print("Loaded model!") + + for i in ll.output(): + print(i,end="",flush=True) + ll.input_echo = False + + inp = lambda x: f" {x}\n" + while True: + ll.input(inp(input(' '))) + for i in ll.output(): + print(i,end="",flush=True) \ No newline at end of file From 9e872410dae603d15b10cdf33fe62e9d51114c16 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 11:48:48 +0200 Subject: [PATCH 15/77] Add instruction mode --- examples/low_level_api_chat_cpp.py | 99 +++++++++++++++++++----------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index ec9703d1f..8d4e8b692 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -5,24 +5,26 @@ Quirks: * Input is always echoed if on, so it should be turned off when using "input()" * The first antiprompt should be the userprompt like "\nUser:", because its added when n_predict is reached (aka generation ended prematurely) - * n_predict can be set to -1 for unlimited length responses + * n_predict can be set to -1 for unlimited length responses (or just a really high value) + * It's always in interactive mode, generation ends either by reaching an antiprompt + or running out of n_predict. + * Instruction mode adds its own antiprompt """ import llama_cpp -def toIntArray(lst): - return [int(i) for i in lst] - # A LLaMA interactive session class LLaMAInteract: def __init__(self, primer: str="", model: str="./models/30B/ggml-model-q4_0.bin", + instruct: bool=False, n_ctx: int=1024, seed: int=0, n_threads: int=8, antiprompt: list[str]=[], input_echo: bool=True, n_predict: int=20, + n_keep: int=0, n_batch: int=8, repeat_last_n: int=64, top_k: int=50, @@ -31,17 +33,17 @@ class LLaMAInteract: repeat_penalty: float=1, ) -> None: # input args + self.instruct = instruct self.n_threads = n_threads self.input_echo = input_echo self.n_predict = n_predict + self.n_keep = n_keep self.n_batch = n_batch self.repeat_last_n = repeat_last_n self.top_k=top_k self.top_p=top_p self.temp=temp self.repeat_penalty=repeat_penalty - self.n_ctx = n_ctx - self.seed = seed # runtime args self.input_consumed = 0 @@ -54,8 +56,8 @@ class LLaMAInteract: # model load self.lparams = llama_cpp.llama_context_default_params() - self.lparams.n_ctx = self.n_ctx - self.lparams.seed = self.seed + self.lparams.n_ctx = n_ctx + self.lparams.seed = seed self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) # determine the required inference memory per token: @@ -63,29 +65,44 @@ class LLaMAInteract: llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) # determine newline token - self.llama_token_newline = (llama_cpp.llama_token * 1)() - llama_cpp.llama_tokenize(self.ctx, b"\n", self.llama_token_newline, len(self.llama_token_newline), False) - self.llama_token_newline = toIntArray(self.llama_token_newline) + self.llama_token_newline = self._tokenize("\n", False) + self.inp_prefix = self._tokenize("\n\n### Instruction:\n\n") + self.inp_suffix = self._tokenize("\n\n### Response:\n\n", False) + + # add instruction as antiprompt + if (self.instruct): + self.first_antiprompt.append(self.inp_prefix) # primer feed if (len(primer) > 0): - 
self.input(primer) - self.n_keep = len(self.embd_inp) + self.embd_inp += self._tokenize(primer) + + # break immediately if using instruct + self.init_break = self.instruct + + # number of tokens to keep when resetting context + if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): + self.n_keep = len(self.embd_inp) # create internal context - self.n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices # determine antiprompt tokens for i in antiprompt: - d_antiprompt = (llama_cpp.llama_token * (len(i) + 1))() - n_antiprompt = llama_cpp.llama_tokenize(self.ctx, i.encode("utf8"), d_antiprompt, len(d_antiprompt), False) - self.first_antiprompt.append(toIntArray(d_antiprompt[:n_antiprompt])) + self.first_antiprompt.append(self._tokenize(i, False)) + + # tokenize a prompt + def _tokenize(self, prompt, bos=True): + _arr = (llama_cpp.llama_token * (len(prompt) + 1))() + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) + return _arr[:_n] # if an antiprompt is present def use_antiprompt(self): return len(self.first_antiprompt) > 0 + # generate tokens def generate(self): while self.remaining_tokens > 0 or self.use_antiprompt(): # predict @@ -125,16 +142,16 @@ class LLaMAInteract: self.repeat_penalty, ) self.last_n_tokens.pop(0) - self.last_n_tokens.append(int(id)) + self.last_n_tokens.append(id) # replace end of text token with newline token when in interactive mode - if (id == llama_cpp.llama_token_eos() and self.use_antiprompt()): + if (id == llama_cpp.llama_token_eos() and self.use_antiprompt() and not self.instruct): id = self.llama_token_newline[0] # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] # add it to the context - self.embd.append(int(id)) + self.embd.append(id) # echo this to console self.output_echo = True @@ -147,9 +164,9 @@ class LLaMAInteract: # some user input remains from prompt or interaction, forward it to processing while len(self.embd_inp) > self.input_consumed: - self.embd.append(int(self.embd_inp[self.input_consumed])) + self.embd.append(self.embd_inp[self.input_consumed]) self.last_n_tokens.pop(0) - self.last_n_tokens.append(int(self.embd_inp[self.input_consumed])) + self.last_n_tokens.append(self.embd_inp[self.input_consumed]) self.input_consumed += 1 if len(self.embd) >= self.n_batch: break @@ -159,11 +176,17 @@ class LLaMAInteract: for id in self.embd: yield id - # if antiprompt is present, stop - if (self.use_antiprompt() and len(self.embd_inp) <= self.input_consumed): - for i in self.first_antiprompt: - if i == self.last_n_tokens[-len(i):]: - return + if (len(self.embd_inp) <= self.input_consumed): + # if antiprompt is present, stop + if (self.use_antiprompt()): + for i in self.first_antiprompt: + if i == self.last_n_tokens[-len(i):]: + return + + # if we are using instruction mode, and we have processed the initial prompt + if (self.init_break): + self.init_break = False + break # if end of generation if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): @@ -174,15 +197,20 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + # write input def input(self, prompt: str): - embd_arr = (llama_cpp.llama_token * (len(prompt) + 1))() - n_of_tok = llama_cpp.llama_tokenize(self.ctx, 
prompt.encode("utf8"), embd_arr, len(embd_arr), True) - self.embd_inp += toIntArray(embd_arr[:n_of_tok]) + if (self.instruct): + self.embd_inp += self.inp_prefix + self.embd_inp += self._tokenize(prompt + "\n") + if (self.instruct): + self.embd_inp += self.inp_suffix + # write output def output(self): self.remaining_tokens = self.n_predict for id in self.generate(): @@ -214,7 +242,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}:""" print("Loading model...") - ll = LLaMAInteract(prompt, + m = LLaMAInteract(prompt, model="./models/30B/ggml-model-q4_0.bin", n_ctx=2048, antiprompt=[f"\n{USER_NAME}:"], @@ -224,12 +252,11 @@ The transcript only includes text, it does not include markup like HTML and Mark ) print("Loaded model!") - for i in ll.output(): + for i in m.output(): print(i,end="",flush=True) - ll.input_echo = False + m.input_echo = False - inp = lambda x: f" {x}\n" while True: - ll.input(inp(input(' '))) - for i in ll.output(): + m.input(" " + input('\n> ' if m.instruct else " ")) + for i in m.output(): print(i,end="",flush=True) \ No newline at end of file From 0bfad75406c8204a95a6bcc982d8ca351c9bbd7a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 16:18:26 +0200 Subject: [PATCH 16/77] Added instruction mode, fixed infinite generation, and various other fixes --- examples/low_level_api_chat_cpp.py | 62 +++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 8d4e8b692..45a6262d4 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -8,7 +8,9 @@ Quirks: * n_predict can be set to -1 for unlimited length responses (or just a really high value) * It's always in interactive mode, generation ends either by reaching an antiprompt or running out of n_predict. - * Instruction mode adds its own antiprompt + * Instruction mode adds its own antiprompt. + You should also still be feeding the model with a "primer" prompt that + shows it the expected format. 
""" import llama_cpp @@ -31,6 +33,8 @@ class LLaMAInteract: top_p: float=1., temp: float=1.0, repeat_penalty: float=1, + instruct_inp_prefix: str="\n\n### Instruction:\n\n", + instruct_inp_suffix: str="\n\n### Response:\n\n", ) -> None: # input args self.instruct = instruct @@ -66,12 +70,12 @@ class LLaMAInteract: # determine newline token self.llama_token_newline = self._tokenize("\n", False) - self.inp_prefix = self._tokenize("\n\n### Instruction:\n\n") - self.inp_suffix = self._tokenize("\n\n### Response:\n\n", False) + self.inp_prefix = self._tokenize(instruct_inp_prefix) + self.inp_suffix = self._tokenize(instruct_inp_suffix, False) # add instruction as antiprompt if (self.instruct): - self.first_antiprompt.append(self.inp_prefix) + self.first_antiprompt.append(self.inp_prefix.strip()) # primer feed if (len(primer) > 0): @@ -117,10 +121,9 @@ class LLaMAInteract: # insert n_left/2 tokens at the start of embd from last_n_tokens _insert = self.last_n_tokens[ - -(int(n_left/2) - len(self.embd)):-len(self.embd) + self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] - self.embd[:len(_insert)] = _insert - #TODO: Still untested + self.embd = _insert + self.embd if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads @@ -197,6 +200,12 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + llama_cpp.llama_free(self.ctx) + # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: @@ -206,7 +215,7 @@ class LLaMAInteract: def input(self, prompt: str): if (self.instruct): self.embd_inp += self.inp_prefix - self.embd_inp += self._tokenize(prompt + "\n") + self.embd_inp += self._tokenize(prompt) if (self.instruct): self.embd_inp += self.inp_suffix @@ -242,21 +251,38 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}:""" print("Loading model...") - m = LLaMAInteract(prompt, + with LLaMAInteract(prompt, model="./models/30B/ggml-model-q4_0.bin", n_ctx=2048, antiprompt=[f"\n{USER_NAME}:"], repeat_last_n=256, n_predict=2048, temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 - ) - print("Loaded model!") + ) as m: + print("Loaded model!") - for i in m.output(): - print(i,end="",flush=True) - m.input_echo = False - - while True: - m.input(" " + input('\n> ' if m.instruct else " ")) for i in m.output(): - print(i,end="",flush=True) \ No newline at end of file + print(i,end="",flush=True) + m.input_echo = False + + def inp(): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + while True: + if (m.instruct): + print('\n> ', end="") + m.input(inp()) + else: + print(f" ", end="") + m.input(f" {inp()}{AI_NAME}:") + print(f"{AI_NAME}: ",end="") + + try: + for i in m.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + print(f"\n{USER_NAME}:",end="") + m.input(f"\n{USER_NAME}:") From 3c1020b86697d9c3e0cfed8f10d4b4300ebe5d84 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 16:20:27 +0200 Subject: [PATCH 17/77] Fix stripping instruction prompt --- examples/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 45a6262d4..947be4ad0 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -75,7 +75,7 @@ class LLaMAInteract: # add instruction as 
antiprompt if (self.instruct): - self.first_antiprompt.append(self.inp_prefix.strip()) + self.first_antiprompt.append(self._tokenize(self.inp_prefix.strip())) # primer feed if (len(primer) > 0): From ae1f37f505d7e9061205394493f71f45a36712ea Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 17:54:47 +0200 Subject: [PATCH 18/77] Fix repeating instructions and an antiprompt bug --- examples/low_level_api_chat_cpp.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 947be4ad0..eec2ff665 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -75,7 +75,7 @@ class LLaMAInteract: # add instruction as antiprompt if (self.instruct): - self.first_antiprompt.append(self._tokenize(self.inp_prefix.strip())) + self.first_antiprompt.append(self._tokenize(instruct_inp_prefix.strip(), False)) # primer feed if (len(primer) > 0): @@ -197,7 +197,8 @@ class LLaMAInteract: # respect n_predict even if antiprompt is present if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): - self.embd_inp += self.first_antiprompt[0] + if not self.instruct: + self.embd_inp += self.first_antiprompt[0] break def __enter__(self): @@ -213,7 +214,7 @@ class LLaMAInteract: # write input def input(self, prompt: str): - if (self.instruct): + if (self.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): self.embd_inp += self.inp_prefix self.embd_inp += self._tokenize(prompt) if (self.instruct): @@ -284,5 +285,6 @@ The transcript only includes text, it does not include markup like HTML and Mark for i in m.output(): print(i,end="",flush=True) except KeyboardInterrupt: - print(f"\n{USER_NAME}:",end="") - m.input(f"\n{USER_NAME}:") + if not m.instruct: + print(f"\n{USER_NAME}:",end="") + m.input(f"\n{USER_NAME}:") From 739e8d4c9bc268d556217c1a4b07818b542ac041 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 5 Apr 2023 14:47:24 +0200 Subject: [PATCH 19/77] Fix bug in init_break not being set when exited via antiprompt and others. 
--- examples/low_level_api_chat_cpp.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index eec2ff665..6003e0c62 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -33,6 +33,7 @@ class LLaMAInteract: top_p: float=1., temp: float=1.0, repeat_penalty: float=1, + init_break: bool=True, instruct_inp_prefix: str="\n\n### Instruction:\n\n", instruct_inp_suffix: str="\n\n### Response:\n\n", ) -> None: @@ -48,6 +49,7 @@ class LLaMAInteract: self.top_p=top_p self.temp=temp self.repeat_penalty=repeat_penalty + self.init_break = init_break # runtime args self.input_consumed = 0 @@ -81,9 +83,6 @@ class LLaMAInteract: if (len(primer) > 0): self.embd_inp += self._tokenize(primer) - # break immediately if using instruct - self.init_break = self.instruct - # number of tokens to keep when resetting context if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): self.n_keep = len(self.embd_inp) @@ -182,13 +181,14 @@ class LLaMAInteract: if (len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop if (self.use_antiprompt()): - for i in self.first_antiprompt: - if i == self.last_n_tokens[-len(i):]: - return + if True in [ + i == self.last_n_tokens[-len(i):] + for i in self.first_antiprompt + ]: + break # if we are using instruction mode, and we have processed the initial prompt if (self.init_break): - self.init_break = False break # if end of generation @@ -201,6 +201,8 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + self.init_break = False + def __enter__(self): return self From ce66405da184891a8f03e7e2d908bd4ee2926efe Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Apr 2023 04:17:26 -0400 Subject: [PATCH 20/77] Add quantize example --- examples/quantize.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 examples/quantize.py diff --git a/examples/quantize.py b/examples/quantize.py new file mode 100644 index 000000000..8bd03f88a --- /dev/null +++ b/examples/quantize.py @@ -0,0 +1,25 @@ +import os +import argparse +import llama_cpp + + +def main(args): + if not os.path.exists(fname_inp): + raise RuntimeError(f"Input file does not exist ({fname_inp})") + if os.path.exists(fname_out): + raise RuntimeError(f"Output file already exists ({fname_out})") + fname_inp = args.fname_inp.encode("utf-8") + fname_out = args.fname_out.encode("utf-8") + itype = args.itype + return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype) + if return_code != 0: + raise RuntimeError("Failed to quantize model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("fname_inp", type=str, help="Path to input model") + parser.add_argument("fname_out", type=str, help="Path to output model") + parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)") + args = parser.parse_args() + main(args) From 29e9fb66a3a09c0e744e6e82ab370ca509a90645 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Thu, 6 Apr 2023 15:30:57 +0200 Subject: [PATCH 21/77] Better llama.cpp interoperability Has some too many newline issues so WIP (Update) Fixed too many newlines, now onto args. Still needs shipping work so you could do "python -m llama_cpp.examples." etc. 
--- examples/common.py | 135 ++++++++++++ examples/low_level_api_chat_cpp.py | 342 +++++++++++++++++++---------- 2 files changed, 357 insertions(+), 120 deletions(-) create mode 100644 examples/common.py diff --git a/examples/common.py b/examples/common.py new file mode 100644 index 000000000..f80d995c5 --- /dev/null +++ b/examples/common.py @@ -0,0 +1,135 @@ +import os +import argparse + +from dataclasses import dataclass, field +from typing import List, Optional + +# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp + + +@dataclass +class GptParams: + seed: int = -1 + n_threads: int = min(4, os.cpu_count() or 1) + n_predict: int = 128 + repeat_last_n: int = 64 + n_parts: int = -1 + n_ctx: int = 512 + n_batch: int = 8 + n_keep: int = 0 + + top_k: int = 40 + top_p: float = 0.95 + temp: float = 0.80 + repeat_penalty: float = 1.10 + + model: str = "./models/llama-7B/ggml-model.bin" + prompt: str = "" + input_prefix: str = " " + fix_prefix: str = "" + output_postfix: str = "" + input_echo: bool = True, + + antiprompt: List[str] = field(default_factory=list) + + memory_f16: bool = True + random_prompt: bool = False + use_color: bool = False + interactive: bool = False + + embedding: bool = False + interactive_start: bool = False + + instruct: bool = False + ignore_eos: bool = False + perplexity: bool = False + use_mlock: bool = False + mem_test: bool = False + verbose_prompt: bool = False + + # Default instructions for Alpaca + # switch to "Human" and "Assistant" for Vicuna. + instruct_inp_prefix: str="\n\n### Instruction:\n\n", + instruct_inp_suffix: str="\n\n### Response:\n\n", + + +def gpt_params_parse(argv = None, params: Optional[GptParams] = None): + if params is None: + params = GptParams() + + parser = argparse.ArgumentParser() + parser.add_argument("-h", "--help", action="store_true", help="show this help message and exit") + parser.add_argument("-s", "--seed", type=int, default=-1, help="",dest="seed") + parser.add_argument("-t", "--threads", type=int, default=1, help="",dest="n_threads") + parser.add_argument("-p", "--prompt", type=str, default="", help="",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="") + parser.add_argument("-c", "--ctx_size", type=int, default=512, help="",dest="n_ctx") + parser.add_argument("--memory_f32", action="store_false", help="",dest="memory_f16") + parser.add_argument("--top_p", type=float, default=0.9, help="",dest="top_p") + parser.add_argument("--temp", type=float, default=1.0, help="",dest="temp") + parser.add_argument("--repeat_last_n", type=int, default=64, help="",dest="repeat_last_n") + parser.add_argument("--repeat_penalty", type=float, default=1.0, help="",dest="repeat_penalty") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="",dest="n_batch") + parser.add_argument("--keep", type=int, default=0, help="",dest="n_keep") + parser.add_argument("-m", "--model", type=str, help="",dest="model") + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument("--interactive-start", action="store_true", help="", dest="interactive_start") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive" + ) + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use 
with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument("--mlock", action="store_true",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",dest="mem_test") + parser.add_argument( + "-r", + "--reverse-prompt", + type=str, + action='append', + help="run in interactive mode and poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + dest="antiprompt" + ) + parser.add_argument("--perplexity", action="store_true", help="", dest="perplexity") + parser.add_argument("--ignore-eos", action="store_true", help="", dest="ignore_eos") + parser.add_argument("--n_parts", type=int, default=-1, help="", dest="n_parts") + parser.add_argument("--random-prompt", action="store_true", help="", dest="random_prompt") + parser.add_argument("--in-prefix", type=str, default=" ", help="", dest="input_prefix") + parser.add_argument("--fix-prefix", type=str, default=" ", help="", dest="fix_prefix") + parser.add_argument("--out-postfix", type=str, default="", help="", dest="output_postfix") + parser.add_argument("--input-noecho", action="store_false", help="", dest="input_echo") + args = parser.parse_args(argv) + return args + +def gpt_random_prompt(rng): + return [ + "So", + "Once upon a time", + "When", + "The", + "After", + "If", + "import", + "He", + "She", + "They", + ][rng % 10] + +if __name__ == "__main__": + print(GptParams(gpt_params_parse())) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 6003e0c62..e7370c01f 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -12,102 +12,182 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. 
""" +import sys +from time import time +from os import cpu_count + import llama_cpp +from common import GptParams, gpt_params_parse, gpt_random_prompt + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN # A LLaMA interactive session class LLaMAInteract: - def __init__(self, - primer: str="", - model: str="./models/30B/ggml-model-q4_0.bin", - instruct: bool=False, - n_ctx: int=1024, - seed: int=0, - n_threads: int=8, - antiprompt: list[str]=[], - input_echo: bool=True, - n_predict: int=20, - n_keep: int=0, - n_batch: int=8, - repeat_last_n: int=64, - top_k: int=50, - top_p: float=1., - temp: float=1.0, - repeat_penalty: float=1, - init_break: bool=True, - instruct_inp_prefix: str="\n\n### Instruction:\n\n", - instruct_inp_suffix: str="\n\n### Response:\n\n", - ) -> None: + def __init__(self, params: GptParams) -> None: # input args - self.instruct = instruct - self.n_threads = n_threads - self.input_echo = input_echo - self.n_predict = n_predict - self.n_keep = n_keep - self.n_batch = n_batch - self.repeat_last_n = repeat_last_n - self.top_k=top_k - self.top_p=top_p - self.temp=temp - self.repeat_penalty=repeat_penalty - self.init_break = init_break + self.params = params + + if (self.params.perplexity): + raise NotImplementedError("""************ +please use the 'perplexity' tool for perplexity calculations +************""") + + if (self.params.embedding): + raise NotImplementedError("""************ +please use the 'embedding' tool for embedding calculations +************""") + + if (self.params.n_ctx > 2048): + print(f"""warning: model does not support \ +context sizes greater than 2048 tokens ({self.params.n_ctx} \ +specified) expect poor results""", file=sys.stderr) + + if (self.params.seed <= 0): + self.params.seed = int(time()) + + print(f"seed = {self.params.seed}", file=sys.stderr) + + if (self.params.random_prompt): + self.params.prompt = gpt_random_prompt(self.params.seed) # runtime args self.input_consumed = 0 self.embd = [] - self.embd_inp = [] self.n_past = 0 self.first_antiprompt = [] - self.remaining_tokens = self.n_predict - self.output_echo = input_echo + self.remaining_tokens = self.params.n_predict + self.output_echo = self.params.input_echo # model load self.lparams = llama_cpp.llama_context_default_params() - self.lparams.n_ctx = n_ctx - self.lparams.seed = seed - self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) + self.lparams.n_ctx = self.params.n_ctx + self.lparams.n_parts = self.params.n_parts + self.lparams.seed = self.params.seed + self.lparams.memory_f16 = self.params.memory_f16 + self.lparams.use_mlock = self.params.use_mlock + + self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) + if (self.ctx == 0): + raise RuntimeError(f"error: failed to load model '{self.params.model}'") + + print(file=sys.stderr) + print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) # determine the required inference memory per token: - tmp = [0, 1, 2, 3] - llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) - - # determine newline token - self.llama_token_newline = self._tokenize("\n", False) - self.inp_prefix = self._tokenize(instruct_inp_prefix) - self.inp_suffix = 
self._tokenize(instruct_inp_suffix, False) - - # add instruction as antiprompt - if (self.instruct): - self.first_antiprompt.append(self._tokenize(instruct_inp_prefix.strip(), False)) - - # primer feed - if (len(primer) > 0): - self.embd_inp += self._tokenize(primer) - - # number of tokens to keep when resetting context - if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): - self.n_keep = len(self.embd_inp) + if (self.params.mem_test): + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + llama_cpp.llama_print_timings(self.ctx) + self.exit() + return # create internal context self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) - self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + # Add a space in front of the first character to match OG llama tokenizer behavior + self.params.prompt = " " + self.params.prompt + + # tokenize the prompt + self.embd_inp = self._tokenize(self.params.prompt) + + if (len(self.embd_inp) > self.params.n_ctx - 4): + raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + + # number of tokens to keep when resetting context + if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): + self.params.n_keep = len(self.embd_inp) + + self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix) + self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) + + # in instruct mode, we inject a prefix and a suffix to each input by the user + if (self.params.instruct): + self.params.interactive_start = True + self.first_antiprompt.append(self._tokenize(self.params.instruct_inp_prefix.strip(), False)) + + # enable interactive mode if reverse prompt or interactive start is specified + if (len(self.params.antiprompt) != 0 or self.params.interactive_start): + self.params.interactive = True + + # determine newline token + self.llama_token_newline = self._tokenize("\n", False) + + if (self.params.verbose_prompt): + print(f""" +prompt: '{self.params.prompt}' +number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) + + for i in range(len(self.embd_inp)): + print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr) + + if (self.params.n_keep > 0): + print("static prompt based on n_keep: '") + for i in range(self.params.n_keep): + print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr) + print("'", file=sys.stderr) + print(file=sys.stderr) + + if (self.params.interactive): + print("interactive mode on.", file=sys.stderr) + + if (len(self.params.antiprompt) > 0): + for antiprompt in self.params.antiprompt: + print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr) + + if len(self.params.input_prefix) > 0: + print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) + + print(f"""sampling: temp = {self.params.temp},\ +top_k = {self.params.top_k},\ +top_p = {self.params.top_p},\ +repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty} + +generate: n_ctx = {self.n_ctx}, \ +n_batch = {self.params.n_batch}, \ +n_predict = {self.params.n_predict}, \ +n_keep = {self.params.n_keep} +""", file=sys.stderr) # determine antiprompt tokens - for i in antiprompt: + for i in self.params.antiprompt: self.first_antiprompt.append(self._tokenize(i, False)) + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + if (params.interactive): + 
print("""== Running in interactive mode. == + - Press Ctrl+C to interject at any time. + - Press Return to return control to LLaMa. + - If you want to submit another line, end your input in '\\'. + +""", file=sys.stderr) + self.set_color(CONSOLE_COLOR_PROMPT) + # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) return _arr[:_n] - # if an antiprompt is present def use_antiprompt(self): return len(self.first_antiprompt) > 0 + def set_color(self, c): + if (self.params.use_color): + print(c, end="") + # generate tokens def generate(self): - while self.remaining_tokens > 0 or self.use_antiprompt(): + while self.remaining_tokens > 0 or self.params.interactive: # predict if len(self.embd) > 0: # infinite text generation via context swapping @@ -115,8 +195,8 @@ class LLaMAInteract: # - take the n_keep first tokens from the original prompt (via n_past) # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch if (self.n_past + len(self.embd) > self.n_ctx): - n_left = self.n_past - self.n_keep - self.n_past = self.n_keep + n_left = self.n_past - self.params.n_keep + self.n_past = self.params.n_keep # insert n_left/2 tokens at the start of embd from last_n_tokens _insert = self.last_n_tokens[ @@ -125,7 +205,7 @@ class LLaMAInteract: self.embd = _insert + self.embd if (llama_cpp.llama_eval( - self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") @@ -133,24 +213,28 @@ class LLaMAInteract: self.embd = [] if len(self.embd_inp) <= self.input_consumed: # out of user input, sample next token - _arr = self.last_n_tokens[-min(self.repeat_last_n, self.n_past):] + + #TODO: self.params.ignore_eos + + _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] id = llama_cpp.llama_sample_top_p_top_k( self.ctx, (llama_cpp.llama_token * len(_arr))(*_arr), len(_arr), - self.top_k, - self.top_p, - self.temp, - self.repeat_penalty, + self.params.top_k, + self.params.top_p, + self.params.temp, + self.params.repeat_penalty, ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) # replace end of text token with newline token when in interactive mode - if (id == llama_cpp.llama_token_eos() and self.use_antiprompt() and not self.instruct): + if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] - # tokenize and inject first reverse prompt - self.embd_inp += self.first_antiprompt[0] + if (self.use_antiprompt()): + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] # add it to the context self.embd.append(id) @@ -162,7 +246,7 @@ class LLaMAInteract: self.remaining_tokens -= 1 else: # output to console if input echo is on - self.output_echo = self.input_echo + self.output_echo = self.params.input_echo # some user input remains from prompt or interaction, forward it to processing while len(self.embd_inp) > self.input_consumed: @@ -170,7 +254,7 @@ class LLaMAInteract: self.last_n_tokens.pop(0) self.last_n_tokens.append(self.embd_inp[self.input_consumed]) self.input_consumed += 1 - if len(self.embd) >= self.n_batch: + if len(self.embd) >= self.params.n_batch: break # display tokens @@ -178,7 +262,11 @@ class 
LLaMAInteract: for id in self.embd: yield id - if (len(self.embd_inp) <= self.input_consumed): + # reset color to default if we there is no pending user input + if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): + self.set_color(CONSOLE_COLOR_DEFAULT) + + if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop if (self.use_antiprompt()): if True in [ @@ -188,26 +276,36 @@ class LLaMAInteract: break # if we are using instruction mode, and we have processed the initial prompt - if (self.init_break): + if (self.n_past > 0 and self.params.interactive_start): break - # if end of generation + # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + if (not self.params.instruct): + for i in " [end of text]\n": + yield i break # respect n_predict even if antiprompt is present - if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): - if not self.instruct: + if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): + # If we arent in instruction mode, fix the current generation by appending the antiprompt. + # Makes it so if chat ends prematurely you dont append the AI's text etc. + if not self.params.instruct: self.embd_inp += self.first_antiprompt[0] + self.n_remain = self.params.n_predict break - self.init_break = False + self.params.interactive_start = False def __enter__(self): return self def __exit__(self, type, value, tb): + self.exit() + + def exit(self): llama_cpp.llama_free(self.ctx) + self.set_color(CONSOLE_COLOR_DEFAULT) # return past text def past(self): @@ -216,18 +314,51 @@ class LLaMAInteract: # write input def input(self, prompt: str): - if (self.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): + if (self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): self.embd_inp += self.inp_prefix self.embd_inp += self._tokenize(prompt) - if (self.instruct): + if (self.params.instruct): self.embd_inp += self.inp_suffix # write output def output(self): - self.remaining_tokens = self.n_predict + self.remaining_tokens = self.params.n_predict for id in self.generate(): yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + # read user input + def read_input(self): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + # interactive mode + def interact(self): + for i in self.output(): + print(i,end="",flush=True) + self.params.input_echo = False + + while self.params.interactive: + self.set_color(CONSOLE_COLOR_USER_INPUT) + if (self.params.instruct): + print('\n> ', end="") + self.input(self.read_input()) + else: + print(self.params.input_prefix, end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") + print(self.params.output_postfix,end="") + self.set_color(CONSOLE_COLOR_DEFAULT) + + try: + for i in self.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + self.set_color(CONSOLE_COLOR_DEFAULT) + if not self.params.instruct: + print(self.params.fix_prefix,end="") + self.input(self.params.fix_prefix) + if __name__ == "__main__": from datetime import datetime @@ -252,41 +383,12 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. 
{AI_NAME}: Blue {USER_NAME}:""" + args = gpt_params_parse() + params = GptParams(args) - print("Loading model...") - with LLaMAInteract(prompt, - model="./models/30B/ggml-model-q4_0.bin", - n_ctx=2048, - antiprompt=[f"\n{USER_NAME}:"], - repeat_last_n=256, - n_predict=2048, - temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 - ) as m: - print("Loaded model!") + if (args.file): + with open(args.file) as f: + params.prompt = f.read() - for i in m.output(): - print(i,end="",flush=True) - m.input_echo = False - - def inp(): - out = "" - while (t := input()).endswith("\\"): - out += t[:-1] + "\n" - return out + t + "\n" - - while True: - if (m.instruct): - print('\n> ', end="") - m.input(inp()) - else: - print(f" ", end="") - m.input(f" {inp()}{AI_NAME}:") - print(f"{AI_NAME}: ",end="") - - try: - for i in m.output(): - print(i,end="",flush=True) - except KeyboardInterrupt: - if not m.instruct: - print(f"\n{USER_NAME}:",end="") - m.input(f"\n{USER_NAME}:") + with LLaMAInteract() as m: + m.interact() From d5680144c52787e2aded7decefd2370063a8dfcb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 15:05:33 -0400 Subject: [PATCH 22/77] Bugfix: Wrong size of embeddings. Closes #47 --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index fe9a8934b..5f22f6b50 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -229,7 +229,7 @@ _lib.llama_n_ctx.restype = c_int def llama_n_embd(ctx: llama_context_p) -> c_int: - return _lib.llama_n_ctx(ctx) + return _lib.llama_n_embd(ctx) _lib.llama_n_embd.argtypes = [llama_context_p] From e19909249dc87566464e7468443bd2b90e22f8b3 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 7 Apr 2023 13:32:19 +0200 Subject: [PATCH 23/77] More interoperability to the original llama.cpp, and arguments now work --- examples/common.py | 79 +++++++++++++++++------------- examples/low_level_api_chat_cpp.py | 19 ++++--- 2 files changed, 55 insertions(+), 43 deletions(-) diff --git a/examples/common.py b/examples/common.py index f80d995c5..1758a2d1d 100644 --- a/examples/common.py +++ b/examples/common.py @@ -26,9 +26,6 @@ class GptParams: model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" input_prefix: str = " " - fix_prefix: str = "" - output_postfix: str = "" - input_echo: bool = True, antiprompt: List[str] = field(default_factory=list) @@ -47,41 +44,57 @@ class GptParams: mem_test: bool = False verbose_prompt: bool = False + file: str = None + + # If chat ended prematurely, append this to the conversation to fix it. + # Set to "\nUser:" etc. + # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" + fix_prefix: str = " " + output_postfix: str = "" + input_echo: bool = True, + # Default instructions for Alpaca # switch to "Human" and "Assistant" for Vicuna. 
- instruct_inp_prefix: str="\n\n### Instruction:\n\n", - instruct_inp_suffix: str="\n\n### Response:\n\n", + # TODO: TBD how they are gonna handle this upstream + instruct_inp_prefix: str="\n\n### Instruction:\n\n" + instruct_inp_suffix: str="\n\n### Response:\n\n" def gpt_params_parse(argv = None, params: Optional[GptParams] = None): if params is None: params = GptParams() - parser = argparse.ArgumentParser() - parser.add_argument("-h", "--help", action="store_true", help="show this help message and exit") - parser.add_argument("-s", "--seed", type=int, default=-1, help="",dest="seed") - parser.add_argument("-t", "--threads", type=int, default=1, help="",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="") - parser.add_argument("-c", "--ctx_size", type=int, default=512, help="",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.9, help="",dest="top_p") - parser.add_argument("--temp", type=float, default=1.0, help="",dest="temp") - parser.add_argument("--repeat_last_n", type=int, default=64, help="",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.0, help="",dest="repeat_penalty") - parser.add_argument("-b", "--batch_size", type=int, default=8, help="",dest="n_batch") - parser.add_argument("--keep", type=int, default=0, help="",dest="n_keep") - parser.add_argument("-m", "--model", type=str, help="",dest="model") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") + parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") + parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") parser.add_argument( "-i", "--interactive", action="store_true", help="run in interactive mode", 
dest="interactive" ) parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument("--interactive-start", action="store_true", help="", dest="interactive_start") + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) parser.add_argument( "--interactive-first", action="store_true", help="run in interactive mode and wait for input right away", - dest="interactive" + dest="interactive_start" ) parser.add_argument( "-ins", @@ -96,24 +109,24 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="colorise output to distinguish prompt and user input from generations", dest="use_color" ) - parser.add_argument("--mlock", action="store_true",dest="use_mlock") - parser.add_argument("--mtest", action="store_true",dest="mem_test") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") parser.add_argument( "-r", "--reverse-prompt", type=str, action='append', - help="run in interactive mode and poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="", dest="n_parts") - parser.add_argument("--random-prompt", action="store_true", help="", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default=" ", help="", dest="input_prefix") - parser.add_argument("--fix-prefix", type=str, default=" ", help="", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="", dest="output_postfix") - parser.add_argument("--input-noecho", action="store_false", help="", dest="input_echo") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") + parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") + parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") args = parser.parse_args(argv) return args diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index e7370c01f..4badc6721 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -6,8 +6,6 @@ Quirks: * The first antiprompt should be the userprompt like "\nUser:", because its added when n_predict is reached (aka generation ended prematurely) * n_predict can be 
set to -1 for unlimited length responses (or just a really high value) - * It's always in interactive mode, generation ends either by reaching an antiprompt - or running out of n_predict. * Instruction mode adds its own antiprompt. You should also still be feeding the model with a "primer" prompt that shows it the expected format. @@ -59,7 +57,6 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 - self.embd = [] self.n_past = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict @@ -74,7 +71,7 @@ specified) expect poor results""", file=sys.stderr) self.lparams.use_mlock = self.params.use_mlock self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) - if (self.ctx == 0): + if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") print(file=sys.stderr) @@ -95,7 +92,13 @@ specified) expect poor results""", file=sys.stderr) # Add a space in front of the first character to match OG llama tokenizer behavior self.params.prompt = " " + self.params.prompt + # Load prompt file + if (self.params.file): + with open(self.params.file) as f: + self.params.prompt = f.read() + # tokenize the prompt + self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) if (len(self.embd_inp) > self.params.n_ctx - 4): @@ -384,11 +387,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {AI_NAME}: Blue {USER_NAME}:""" args = gpt_params_parse() - params = GptParams(args) + params = GptParams(**vars(args)) - if (args.file): - with open(args.file) as f: - params.prompt = f.read() - - with LLaMAInteract() as m: + with LLaMAInteract(params) as m: m.interact() From f25a81309e3aa9618d79efc53d436ba0a25a8000 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:45:55 -0400 Subject: [PATCH 24/77] Update model paths to be more clear they should point to file --- examples/low_level_api_llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 2a639aad5..b048c0ac8 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -9,7 +9,7 @@ N_THREADS = multiprocessing.cpu_count() prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" lparams = llama_cpp.llama_context_default_params() -ctx = llama_cpp.llama_init_from_file(b"models/ggml-alpaca-7b-q4.bin", lparams) +ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams) # determine the required inference memory per token: tmp = [0, 1, 2, 3] From b36c04c99e9a7885d9ecba25f7a00c8993b6d3cb Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 16:35:38 +0200 Subject: [PATCH 25/77] Added iterative search to prevent instructions from being echoed, add ignore eos, add no-mmap, fixed 1 character echo too much bug --- examples/common.py | 3 +++ examples/low_level_api_chat_cpp.py | 36 ++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/examples/common.py b/examples/common.py index 1758a2d1d..f16980ccb 100644 --- a/examples/common.py +++ b/examples/common.py @@ -40,6 +40,7 @@ class GptParams: instruct: bool = False ignore_eos: bool = False perplexity: bool = False + use_mmap: bool = True use_mlock: bool = False mem_test: bool = False verbose_prompt: bool = False @@ -110,7 +111,9 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): dest="use_color" ) 
parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") parser.add_argument( "-r", "--reverse-prompt", diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 4badc6721..cf4c99d6e 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -26,6 +26,25 @@ CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN +# Iterative search +# Actively searches and prevents a pattern from being returned +class IterSearch: + def __init__(self, pattern): + self.pattern = list(pattern) + self.buffer = [] + + def __call__(self, char): + self.buffer += [char] + + if (self.pattern[:len(self.buffer)] == self.buffer): + if (len(self.buffer) >= len(self.pattern)): + self.buffer.clear() + return [] + + _tmp = self.buffer[:] + self.buffer.clear() + return _tmp + # A LLaMA interactive session class LLaMAInteract: def __init__(self, params: GptParams) -> None: @@ -69,6 +88,7 @@ specified) expect poor results""", file=sys.stderr) self.lparams.seed = self.params.seed self.lparams.memory_f16 = self.params.memory_f16 self.lparams.use_mlock = self.params.use_mlock + self.lparams.use_mmap = self.params.use_mmap self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) if (not self.ctx): @@ -114,7 +134,9 @@ specified) expect poor results""", file=sys.stderr) # in instruct mode, we inject a prefix and a suffix to each input by the user if (self.params.instruct): self.params.interactive_start = True - self.first_antiprompt.append(self._tokenize(self.params.instruct_inp_prefix.strip(), False)) + _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) + self.first_antiprompt.append(_ptn) + self.antiecho = IterSearch(_ptn) # enable interactive mode if reverse prompt or interactive start is specified if (len(self.params.antiprompt) != 0 or self.params.interactive_start): @@ -217,7 +239,9 @@ n_keep = {self.params.n_keep} if len(self.embd_inp) <= self.input_consumed: # out of user input, sample next token - #TODO: self.params.ignore_eos + if (self.params.ignore_eos): + logits = llama_cpp.llama_get_logits(self.ctx) + logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0) _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] id = llama_cpp.llama_sample_top_p_top_k( @@ -263,7 +287,11 @@ n_keep = {self.params.n_keep} # display tokens if self.output_echo: for id in self.embd: - yield id + if self.params.instruct: + for r in self.antiecho(id): + yield r + else: + yield id # reset color to default if we there is no pending user input if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): @@ -279,7 +307,7 @@ n_keep = {self.params.n_keep} break # if we are using instruction mode, and we have processed the initial prompt - if (self.n_past > 0 and self.params.interactive_start): + if (self.params.interactive_start): break # end of text token From d1b35174773896cb3452f7875f7cacaddd486bf9 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 5 
Apr 2023 14:23:01 +0200 Subject: [PATCH 26/77] Allow local llama library usage --- examples/llama_cpp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 5f22f6b50..8bc0b577b 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -29,9 +29,12 @@ def _load_shared_library(lib_base_name): # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() + _local_path = pathlib.Path.cwd() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ + _local_path / f"./lib{lib_base_name}{lib_ext}", + _local_path / f"./{lib_base_name}{lib_ext}", _base_path / f"lib{lib_base_name}{lib_ext}", _base_path / f"{lib_base_name}{lib_ext}" ] From c8b5d0b963c7339d9b3fa98ebc1a5a7b542a2ea7 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 17:00:35 +0200 Subject: [PATCH 27/77] Use environment variable for library override --- examples/llama_cpp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 8bc0b577b..63e8e97bf 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -29,16 +29,16 @@ def _load_shared_library(lib_base_name): # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() - _local_path = pathlib.Path.cwd() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ - _local_path / f"./lib{lib_base_name}{lib_ext}", - _local_path / f"./{lib_base_name}{lib_ext}", _base_path / f"lib{lib_base_name}{lib_ext}", _base_path / f"{lib_base_name}{lib_ext}" ] + if ("LLAMA_CPP_LIB" in os.environ): + _lib_paths = [pathlib.Path(os.environ["LLAMA_CPP_LIB"]).resolve()] + # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) From 848b4021a35d6c5f2ef6e15b771d004687f1779a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 17:06:58 +0200 Subject: [PATCH 28/77] Better custom library debugging --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 63e8e97bf..89eca4bb7 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -37,6 +37,7 @@ def _load_shared_library(lib_base_name): ] if ("LLAMA_CPP_LIB" in os.environ): + lib_base_name = os.environ["LLAMA_CPP_LIB"] _lib_paths = [pathlib.Path(os.environ["LLAMA_CPP_LIB"]).resolve()] # Add the library directory to the DLL search path on Windows (if needed) From d0a7ce9abf690fe6f6ff77d31de8d4942f840787 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 17:12:25 +0200 Subject: [PATCH 29/77] Make windows users happy (hopefully) --- examples/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 89eca4bb7..1611e1635 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -38,7 +38,9 @@ def _load_shared_library(lib_base_name): if ("LLAMA_CPP_LIB" in os.environ): lib_base_name = os.environ["LLAMA_CPP_LIB"] - _lib_paths = [pathlib.Path(os.environ["LLAMA_CPP_LIB"]).resolve()] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] # Add the library directory to the DLL 
search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): From ce0ca60b5676e2dfdb892299b05d849a164b96c1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:01:33 -0400 Subject: [PATCH 30/77] Update llama.cpp (llama_mmap_supported) --- examples/llama_cpp.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 1611e1635..fa59f1a60 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -94,6 +94,7 @@ class llama_context_params(Structure): c_bool, ), # the llama_eval() call computes all logits, not just the last one ("vocab_only", c_bool), # only load the vocabulary, no weights + ("use_mmap", c_bool), # use mmap if possible ("use_mlock", c_bool), # force system to keep model in RAM ("embedding", c_bool), # embedding mode only # called with a progress value between 0 and 1, pass NULL to disable @@ -116,6 +117,17 @@ def llama_context_default_params() -> llama_context_params: _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +def llama_mmap_supported() -> c_bool: + return _lib.llama_mmap_supported() + +_lib.llama_mmap_supported.argtypes = [] +_lib.llama_mmap_supported.restype = c_bool + +def llama_mlock_supported() -> c_bool: + return _lib.llama_mlock_supported() + +_lib.llama_mlock_supported.argtypes = [] +_lib.llama_mlock_supported.restype = c_bool # Various functions for loading a ggml llama model. # Allocate (almost) all memory needed for the model. From d595f330e203d45c0760714ef3ea8f56f2b7304a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 11 Apr 2023 11:59:03 -0400 Subject: [PATCH 31/77] Update llama.cpp --- examples/llama_cpp.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index fa59f1a60..c4df029c9 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -11,10 +11,11 @@ from ctypes import ( Structure, Array, c_uint8, - c_size_t + c_size_t, ) import pathlib + # Load the library def _load_shared_library(lib_base_name): # Determine the file extension based on the platform @@ -33,10 +34,10 @@ def _load_shared_library(lib_base_name): # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ _base_path / f"lib{lib_base_name}{lib_ext}", - _base_path / f"{lib_base_name}{lib_ext}" + _base_path / f"{lib_base_name}{lib_ext}", ] - if ("LLAMA_CPP_LIB" in os.environ): + if "LLAMA_CPP_LIB" in os.environ: lib_base_name = os.environ["LLAMA_CPP_LIB"] _lib = pathlib.Path(lib_base_name) _base_path = _lib.parent.resolve() @@ -54,7 +55,10 @@ def _load_shared_library(lib_base_name): except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found") + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + # Specify the base name of the shared library to load _lib_base_name = "llama" @@ -106,6 +110,10 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) +LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors # Functions @@ -117,18 +125,23 @@ def llama_context_default_params() -> llama_context_params: 
_lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params + def llama_mmap_supported() -> c_bool: return _lib.llama_mmap_supported() + _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool + def llama_mlock_supported() -> c_bool: return _lib.llama_mlock_supported() + _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool + # Various functions for loading a ggml llama model. # Allocate (almost) all memory needed for the model. # Return NULL on failure @@ -162,33 +175,42 @@ def llama_model_quantize( _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int + # Returns the KV cache that will contain the context for the # ongoing prediction with the model. def llama_get_kv_cache(ctx: llama_context_p): return _lib.llama_get_kv_cache(ctx) + _lib.llama_get_kv_cache.argtypes = [llama_context_p] _lib.llama_get_kv_cache.restype = POINTER(c_uint8) + # Returns the size of the KV cache def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_kv_cache_size(ctx) + _lib.llama_get_kv_cache_size.argtypes = [llama_context_p] _lib.llama_get_kv_cache_size.restype = c_size_t + # Returns the number of tokens in the KV cache def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: return _lib.llama_get_kv_cache_token_count(ctx) + _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the KV cache containing the current context for the model -def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int): +def llama_set_kv_cache( + ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int +): return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) + _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] _lib.llama_set_kv_cache.restype = None From 3693449c079e8875934d3c57f1fbed744773b6f3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Apr 2023 14:29:00 -0400 Subject: [PATCH 32/77] Update llama.cpp --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index c4df029c9..935017ab1 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -114,6 +114,7 @@ LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4) # tok_embeddings.weight and output.weight are F16 # Functions From b6ce5133d9fa3015a44dc3b78c546cb8e5a34257 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 01:30:04 -0400 Subject: [PATCH 33/77] Add bindings for LoRA adapters. 
Closes #88 --- examples/llama_cpp.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 935017ab1..c2d1ace63 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -114,7 +114,9 @@ LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4) # tok_embeddings.weight and output.weight are F16 +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( + 4 +) # tok_embeddings.weight and output.weight are F16 # Functions @@ -177,6 +179,22 @@ _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] _lib.llama_model_quantize.restype = c_int +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +def llama_apply_lora_from_file( + ctx: llama_context_p, path_lora: bytes, path_base_model: bytes, n_threads: c_int +) -> c_int: + return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] +_lib.llama_apply_lora_from_file.restype = c_int + + # Returns the KV cache that will contain the context for the # ongoing prediction with the model. def llama_get_kv_cache(ctx: llama_context_p): From 8229410a4eae9996a8b4fced88d8aefbe002cf4a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 10 Apr 2023 16:38:45 +0200 Subject: [PATCH 34/77] More reasonable defaults --- examples/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/common.py b/examples/common.py index f16980ccb..58a5688ba 100644 --- a/examples/common.py +++ b/examples/common.py @@ -50,7 +50,7 @@ class GptParams: # If chat ended prematurely, append this to the conversation to fix it. # Set to "\nUser:" etc. # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" - fix_prefix: str = " " + fix_prefix: str = "" output_postfix: str = "" input_echo: bool = True, From 81c4c10389a814598ba4fd2dbaadb032550e514f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 18 Apr 2023 23:44:46 -0400 Subject: [PATCH 35/77] Update type signature to allow for null pointer to be passed. 
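With ctypes.c_char_p in the signature, the base-model argument can be left as
NULL (None from Python) to apply the adapter on top of the currently loaded
model. Minimal sketch only; the model and adapter paths are placeholders and
the thread count is arbitrary:

    # Apply a LoRA adapter without a separate base model (path_base_model = NULL).
    import llama_cpp

    lparams = llama_cpp.llama_context_default_params()
    ctx = llama_cpp.llama_init_from_file(b"./models/7B/ggml-model.bin", lparams)
    if llama_cpp.llama_apply_lora_from_file(ctx, b"./lora/ggml-adapter-model.bin", None, 4) != 0:
        raise RuntimeError("failed to apply LoRA adapter")
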
--- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index c2d1ace63..5e8a5c316 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -186,7 +186,7 @@ _lib.llama_model_quantize.restype = c_int # will be applied on top of the previous one # Returns 0 on success def llama_apply_lora_from_file( - ctx: llama_context_p, path_lora: bytes, path_base_model: bytes, n_threads: c_int + ctx: llama_context_p, path_lora: ctypes.c_char_p, path_base_model: ctypes.c_char_p, n_threads: c_int ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From bdbaf5dc76ef3c793b8206b3b2cc1ae0bf671513 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 17 Apr 2023 14:45:28 +0200 Subject: [PATCH 36/77] Fixed end of text wrong type, and fix n_predict behaviour --- examples/common.py | 2 +- examples/low_level_api_chat_cpp.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/common.py b/examples/common.py index 58a5688ba..061ec3ae9 100644 --- a/examples/common.py +++ b/examples/common.py @@ -75,7 +75,7 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict") + parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index cf4c99d6e..4a87d7d6b 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -144,6 +144,7 @@ specified) expect poor results""", file=sys.stderr) # determine newline token self.llama_token_newline = self._tokenize("\n", False) + self.llama_token_eot = self._tokenize(" [end of text]\n", False) if (self.params.verbose_prompt): print(f""" @@ -203,16 +204,16 @@ n_keep = {self.params.n_keep} _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) return _arr[:_n] - def use_antiprompt(self): - return len(self.first_antiprompt) > 0 - def set_color(self, c): if (self.params.use_color): print(c, end="") + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + # generate tokens def generate(self): - while self.remaining_tokens > 0 or self.params.interactive: + while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1: # predict if len(self.embd) > 0: # infinite text generation via context swapping @@ -313,7 +314,7 @@ n_keep = {self.params.n_keep} # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): if (not self.params.instruct): - for i in " [end of text]\n": + for i in self.llama_token_eot: yield i break From fd64310276801578e7bfc848664a3b4405e58674 Mon Sep 17 00:00:00 2001 From: Mug <> 
Date: Wed, 26 Apr 2023 14:37:06 +0200 Subject: [PATCH 37/77] Fix decode errors permanently --- examples/low_level_api_chat_cpp.py | 6 +++--- examples/low_level_api_llama_cpp.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 4a87d7d6b..7a932a36f 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -96,7 +96,7 @@ specified) expect poor results""", file=sys.stderr) print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) +| {llama_cpp.llama_print_system_info().decode('utf8', errors='ignore')}", file=sys.stderr) # determine the required inference memory per token: if (self.params.mem_test): @@ -342,7 +342,7 @@ n_keep = {self.params.n_keep} # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") # write input def input(self, prompt: str): @@ -356,7 +356,7 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") # read user input def read_input(self): diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index b048c0ac8..4fb5a0366 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -70,7 +70,7 @@ while remaining_tokens > 0: if not input_noecho: for id in embd: print( - llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"), + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"), end="", flush=True, ) From 5bbf40aa47b767013c692b315ab06da6d5d88a86 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 21 Apr 2023 17:40:27 -0400 Subject: [PATCH 38/77] Update llama.cpp --- examples/llama_cpp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 5e8a5c316..0005e4290 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -117,6 +117,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( 4 ) # tok_embeddings.weight and output.weight are F16 +LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors # Functions @@ -169,13 +171,14 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success +# nthread - how many threads to use. 
If <=0, will use std::thread::hardware_concurrency(), else the number given def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, itype: c_int + fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int ) -> c_int: - return _lib.llama_model_quantize(fname_inp, fname_out, itype) + return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] _lib.llama_model_quantize.restype = c_int From bf9f02d8eec049f7ab11d405aaac15a2df5b63d7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 22 Apr 2023 19:50:28 -0400 Subject: [PATCH 39/77] Update llama.cpp --- examples/llama_cpp.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 0005e4290..44ab04acf 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -189,7 +189,10 @@ _lib.llama_model_quantize.restype = c_int # will be applied on top of the previous one # Returns 0 on success def llama_apply_lora_from_file( - ctx: llama_context_p, path_lora: ctypes.c_char_p, path_base_model: ctypes.c_char_p, n_threads: c_int + ctx: llama_context_p, + path_lora: ctypes.c_char_p, + path_base_model: ctypes.c_char_p, + n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) @@ -237,6 +240,36 @@ _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, _lib.llama_set_kv_cache.restype = None +# Returns the size in bytes of the state (rng, logits, embedding and kv_cache) +def llama_get_state_size(ctx: llama_context_p) -> c_size_t: + return _lib.llama_get_state_size(ctx) + + +_lib.llama_get_state_size.argtypes = [llama_context_p] +_lib.llama_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: + return _lib.llama_copy_state_data(ctx, dest) + + +_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: + return _lib.llama_set_state_data(ctx, src) + + +_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.restype = c_size_t + + # Run the llama inference to obtain the logits and probabilities for the next token. # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls From 80c18cb66510d659f1d6b8e499da6ede8a972f57 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 24 Apr 2023 09:30:10 -0400 Subject: [PATCH 40/77] Update llama.cpp (remove llama_get_kv_cache) --- examples/llama_cpp.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 44ab04acf..90f498aa5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -201,25 +201,6 @@ _lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, _lib.llama_apply_lora_from_file.restype = c_int -# Returns the KV cache that will contain the context for the -# ongoing prediction with the model. 
-def llama_get_kv_cache(ctx: llama_context_p): - return _lib.llama_get_kv_cache(ctx) - - -_lib.llama_get_kv_cache.argtypes = [llama_context_p] -_lib.llama_get_kv_cache.restype = POINTER(c_uint8) - - -# Returns the size of the KV cache -def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: - return _lib.llama_get_kv_cache_size(ctx) - - -_lib.llama_get_kv_cache_size.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_size.restype = c_size_t - - # Returns the number of tokens in the KV cache def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -229,17 +210,6 @@ _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int -# Sets the KV cache containing the current context for the model -def llama_set_kv_cache( - ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int -): - return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) - - -_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] -_lib.llama_set_kv_cache.restype = None - - # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) From 656190750d91740d468eafcbe3b53f7fd3d1c780 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 25 Apr 2023 19:03:41 -0400 Subject: [PATCH 41/77] Update llama.cpp --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 90f498aa5..7c2254015 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -119,6 +119,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( ) # tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors # Functions From 66ad132575ebaecb58a354a8a5f23af70d1865c0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 26 Apr 2023 20:00:54 -0400 Subject: [PATCH 42/77] Update llama.cpp --- examples/llama_cpp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 7c2254015..6fbd393bb 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -120,6 +120,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors # Functions @@ -210,6 +212,12 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int +# Sets the current rng seed. 
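+# (illustrative note, not part of the upstream header) calling this before
+# sampling, e.g. llama_set_rng_seed(ctx, c_int(1234)), makes generations
+# reproducible for a fixed prompt and sampling parameters.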
+def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): + return _lib.llama_set_rng_seed(ctx, seed) + +_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] +_lib.llama_set_rng_seed.restype = None # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: From c8e6ac366a22c9a4c4268c4324735c267bfb6ab8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 28 Apr 2023 15:32:43 -0400 Subject: [PATCH 43/77] Update llama.cpp (llama_load_session_file) --- examples/llama_cpp.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 6fbd393bb..3ac6d6e29 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -212,13 +212,16 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int + # Sets the current rng seed. def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) + _lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] _lib.llama_set_rng_seed.restype = None + # Returns the size in bytes of the state (rng, logits, embedding and kv_cache) def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) @@ -249,6 +252,44 @@ _lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] _lib.llama_set_state_data.restype = c_size_t +# Save/load session file +def llama_load_session_file( + ctx: llama_context_p, + path_session: bytes, + tokens_out, + n_token_capacity: c_size_t, + n_token_count_out, +) -> c_size_t: + return _lib.llama_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.llama_load_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, + POINTER(c_size_t), +] +_lib.llama_load_session_file.restype = c_size_t + + +def llama_save_session_file( + ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t +) -> c_size_t: + return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.llama_save_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, +] +_lib.llama_save_session_file.restype = c_size_t + + # Run the llama inference to obtain the logits and probabilities for the next token. 
# tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls From 36b34943324da5fefa435263ad9739d9f9e78da9 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 26 Apr 2023 14:45:51 +0200 Subject: [PATCH 44/77] Also ignore errors on input prompts --- examples/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 7a932a36f..2e24e8683 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -201,7 +201,7 @@ n_keep = {self.params.n_keep} # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() - _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) return _arr[:_n] def set_color(self, c): From 441d30811accb7350bd6aee81d34d7ee4c8f3899 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 28 Apr 2023 12:50:30 +0200 Subject: [PATCH 45/77] Detect multi-byte responses and wait --- examples/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 2e24e8683..e046c2a79 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -96,7 +96,7 @@ specified) expect poor results""", file=sys.stderr) print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8', errors='ignore')}", file=sys.stderr) +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) # determine the required inference memory per token: if (self.params.mem_test): From d0031edbd2f5cb1559281465a40fe80ba04283b1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 10:44:28 -0400 Subject: [PATCH 46/77] Update llama.cpp --- examples/llama_cpp.py | 230 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 215 insertions(+), 15 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3ac6d6e29..3b5e66047 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -67,6 +67,12 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types +LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_MAGIC = b"ggjt" +LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" +LLAMA_SESSION_MAGIC = b"ggsn" +LLAMA_SESSION_VERSION = ctypes.c_int(0) + llama_context_p = c_void_p @@ -77,13 +83,24 @@ llama_token_p = POINTER(llama_token) class llama_token_data(Structure): _fields_ = [ ("id", llama_token), # token id + ("logit", c_float), # log-odds of the token ("p", c_float), # probability of the token - ("plog", c_float), # log probability of the token ] llama_token_data_p = POINTER(llama_token_data) + +class llama_token_data_array(Structure): + _fields_ = [ + ("data", llama_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +llama_token_data_array_p = POINTER(llama_token_data_array) + llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) @@ -118,7 +135,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( 4 ) # tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors 
LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors @@ -401,31 +418,214 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -# TODO: improve the last_n_tokens interface ? -def llama_sample_top_p_top_k( +def llama_token_nl() -> llama_token: + return _lib.llama_token_nl() + + +_lib.llama_token_nl.argtypes = [] +_lib.llama_token_nl.restype = llama_token + + +# Sampling functions +def llama_sample_repetition_penalty( ctx: llama_context_p, - last_n_tokens_data, # type: Array[llama_token] - last_n_tokens_size: c_int, - top_k: c_int, - top_p: c_float, - temp: c_float, - repeat_penalty: c_float, + candidates, + last_tokens_data, + last_tokens_size: c_int, + penalty: c_float, ) -> llama_token: - return _lib.llama_sample_top_p_top_k( - ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty + return _lib.llama_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty ) -_lib.llama_sample_top_p_top_k.argtypes = [ +_lib.llama_sample_repetition_penalty.argtypes = [ llama_context_p, + llama_token_data_array_p, llama_token_p, c_int, - c_int, c_float, +] +_lib.llama_sample_repetition_penalty.restype = llama_token + + +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def llama_sample_frequency_and_presence_penalties( + ctx: llama_context_p, + candidates, + last_tokens_data, + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +) -> llama_token: + return _lib.llama_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_token_p, + c_int, c_float, c_float, ] -_lib.llama_sample_top_p_top_k.restype = llama_token +_lib.llama_sample_frequency_and_presence_penalties.restype = llama_token + + +# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_softmax(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_softmax(ctx, candidates) + + +_lib.llama_sample_softmax.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_softmax.restype = llama_token + + +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1); +def llama_sample_top_k( + ctx: llama_context_p, candidates, k: c_int, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.llama_sample_top_k.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_int, + c_int, +] + + +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); +def llama_sample_top_p( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.llama_sample_top_p.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_top_p.restype = llama_token + + +# LLAMA_API void llama_sample_tail_free(struct llama_context 
* ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1); +def llama_sample_tail_free( + ctx: llama_context_p, candidates, z: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.llama_sample_tail_free.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_tail_free.restype = llama_token + + +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); +def llama_sample_typical( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_int +) -> llama_token: + return _lib.llama_sample_typical(ctx, candidates, p, min_keep) + + +_lib.llama_sample_typical.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_int, +] +_lib.llama_sample_typical.restype = llama_token + + +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); +def llama_sample_temperature( + ctx: llama_context_p, candidates, temp: c_float +) -> llama_token: + return _lib.llama_sample_temperature(ctx, candidates, temp) + + +_lib.llama_sample_temperature.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, +] +_lib.llama_sample_temperature.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +def llama_sample_token_mirostat( + ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu +) -> llama_token: + return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.llama_sample_token_mirostat.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_int, + POINTER(c_float), +] +_lib.llama_sample_token_mirostat.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +def llama_sample_token_mirostat_v2( + ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu +) -> llama_token: + return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.llama_sample_token_mirostat_v2.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + POINTER(c_float), +] +_lib.llama_sample_token_mirostat_v2.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_token_greedy(ctx, candidates) + + +_lib.llama_sample_token_greedy.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token_greedy.restype = llama_token + + +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: + return _lib.llama_sample_token(ctx, candidates) + + +_lib.llama_sample_token.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token.restype = llama_token # Performance information From 78531e5d055f24614fb5b0d1659ec935794c1765 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 14:02:06 -0400 Subject: [PATCH 47/77] Fix return types and import comments --- examples/llama_cpp.py | 72 
+++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3b5e66047..601ffc6c2 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -427,13 +427,16 @@ _lib.llama_token_nl.restype = llama_token # Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, last_tokens_data, last_tokens_size: c_int, penalty: c_float, -) -> llama_token: +): return _lib.llama_sample_repetition_penalty( ctx, candidates, last_tokens_data, last_tokens_size, penalty ) @@ -446,10 +449,10 @@ _lib.llama_sample_repetition_penalty.argtypes = [ c_int, c_float, ] -_lib.llama_sample_repetition_penalty.restype = llama_token +_lib.llama_sample_repetition_penalty.restype = None -# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, @@ -457,7 +460,7 @@ def llama_sample_frequency_and_presence_penalties( last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, -) -> llama_token: +): return _lib.llama_sample_frequency_and_presence_penalties( ctx, candidates, @@ -476,11 +479,11 @@ _lib.llama_sample_frequency_and_presence_penalties.argtypes = [ c_float, c_float, ] -_lib.llama_sample_frequency_and_presence_penalties.restype = llama_token +_lib.llama_sample_frequency_and_presence_penalties.restype = None -# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_softmax(ctx: llama_context_p, candidates) -> llama_token: +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
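+# (illustrative sketch, not part of the upstream header) these samplers are
+# meant to be chained on a single llama_token_data_array built from
+# llama_get_logits(): apply the repetition/frequency penalties first, then
+# top_k / tail_free / typical / top_p / temperature, and finally
+# llama_sample_token; the low-level examples later in this series do exactly that.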
+def llama_sample_softmax(ctx: llama_context_p, candidates): return _lib.llama_sample_softmax(ctx, candidates) @@ -488,13 +491,11 @@ _lib.llama_sample_softmax.argtypes = [ llama_context_p, llama_token_data_array_p, ] -_lib.llama_sample_softmax.restype = llama_token +_lib.llama_sample_softmax.restype = None -# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1); -def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_int -) -> llama_token: +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +def llama_sample_top_k(ctx: llama_context_p, candidates, k: c_int, min_keep: c_int): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -504,12 +505,11 @@ _lib.llama_sample_top_k.argtypes = [ c_int, c_int, ] +_lib.llama_sample_top_k.restype = None -# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); -def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_int -) -> llama_token: +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +def llama_sample_top_p(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -519,13 +519,13 @@ _lib.llama_sample_top_p.argtypes = [ c_float, c_int, ] -_lib.llama_sample_top_p.restype = llama_token +_lib.llama_sample_top_p.restype = None -# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1); +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, candidates, z: c_float, min_keep: c_int -) -> llama_token: +): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -535,13 +535,11 @@ _lib.llama_sample_tail_free.argtypes = [ c_float, c_int, ] -_lib.llama_sample_tail_free.restype = llama_token +_lib.llama_sample_tail_free.restype = None -# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); -def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_int -) -> llama_token: +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
+def llama_sample_typical(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -551,13 +549,10 @@ _lib.llama_sample_typical.argtypes = [ c_float, c_int, ] -_lib.llama_sample_typical.restype = llama_token +_lib.llama_sample_typical.restype = None -# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); -def llama_sample_temperature( - ctx: llama_context_p, candidates, temp: c_float -) -> llama_token: +def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -566,10 +561,15 @@ _lib.llama_sample_temperature.argtypes = [ llama_token_data_array_p, c_float, ] -_lib.llama_sample_temperature.restype = llama_token +_lib.llama_sample_temperature.restype = None -# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu ) -> llama_token: @@ -587,7 +587,11 @@ _lib.llama_sample_token_mirostat.argtypes = [ _lib.llama_sample_token_mirostat.restype = llama_token -# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. 
A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu ) -> llama_token: @@ -604,7 +608,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ _lib.llama_sample_token_mirostat_v2.restype = llama_token -# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +# @details Selects the token with the highest probability. def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -616,7 +620,7 @@ _lib.llama_sample_token_greedy.argtypes = [ _lib.llama_sample_token_greedy.restype = llama_token -# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +# @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From c26e9bf1c1552bd076ae21bbbc1146ce7dc6d5ff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 1 May 2023 14:47:55 -0400 Subject: [PATCH 48/77] Update sampling api --- examples/llama_cpp.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 601ffc6c2..4e4596ea7 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -495,7 +495,9 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -def llama_sample_top_k(ctx: llama_context_p, candidates, k: c_int, min_keep: c_int): +def llama_sample_top_k( + ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -503,13 +505,15 @@ _lib.llama_sample_top_k.argtypes = [ llama_context_p, llama_token_data_array_p, c_int, - c_int, + c_size_t, ] _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -def llama_sample_top_p(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): +def llama_sample_top_p( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -517,14 +521,14 @@ _lib.llama_sample_top_p.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
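# (note, mirroring the --tfs help text added later in this series) z = 1.0
# leaves the candidate distribution unchanged, i.e. tail free sampling is
# effectively disabled; min_keep appears to set a floor on how many candidates
# are kept.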
def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_int + ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -533,13 +537,15 @@ _lib.llama_sample_tail_free.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. -def llama_sample_typical(ctx: llama_context_p, candidates, p: c_float, min_keep: c_int): +def llama_sample_typical( + ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) +): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -547,7 +553,7 @@ _lib.llama_sample_typical.argtypes = [ llama_context_p, llama_token_data_array_p, c_float, - c_int, + c_size_t, ] _lib.llama_sample_typical.restype = None From d15578e63e5648373d42f04a31ca6e37055457ea Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 09:33:30 -0400 Subject: [PATCH 49/77] Update llama.cpp (session version) --- examples/llama_cpp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 4e4596ea7..5baa6cc76 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -71,7 +71,7 @@ LLAMA_FILE_VERSION = ctypes.c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(0) +LLAMA_SESSION_VERSION = ctypes.c_int(1) llama_context_p = c_void_p @@ -239,7 +239,8 @@ _lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] _lib.llama_set_rng_seed.restype = None -# Returns the size in bytes of the state (rng, logits, embedding and kv_cache) +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens def llama_get_state_size(ctx: llama_context_p) -> c_size_t: return _lib.llama_get_state_size(ctx) From 9e79465b215497409c5740f4285f1b508938ea93 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:05:31 -0400 Subject: [PATCH 50/77] Prefer explicit imports --- examples/llama_cpp.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 5baa6cc76..a56243dc9 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -67,11 +67,11 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_VERSION = c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(1) +LLAMA_SESSION_VERSION = c_int(1) llama_context_p = c_void_p @@ -127,18 +127,18 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) -LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( +LLAMA_FTYPE_ALL_F32 = c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and 
output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_3 = c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors # Functions @@ -210,8 +210,8 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_apply_lora_from_file( ctx: llama_context_p, - path_lora: ctypes.c_char_p, - path_base_model: ctypes.c_char_p, + path_lora: c_char_p, + path_base_model: c_char_p, n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From 32cf0133c9d494642e56abf0e7cb5ae0cceb0bcb Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Thu, 4 May 2023 18:33:08 +0200 Subject: [PATCH 51/77] Update low level examples --- examples/Chat.py | 70 ++++++++++ examples/Miku.py | 59 ++++++++ examples/ReasonAct.py | 49 +++++++ examples/common.py | 163 ++++++++++++++-------- examples/low_level_api_chat_cpp.py | 202 ++++++++++++++++++++++++---- examples/low_level_api_llama_cpp.py | 35 +++-- 6 files changed, 486 insertions(+), 92 deletions(-) create mode 100644 examples/Chat.py create mode 100644 examples/Miku.py create mode 100644 examples/ReasonAct.py diff --git a/examples/Chat.py b/examples/Chat.py new file mode 100644 index 000000000..9283fcb8a --- /dev/null +++ b/examples/Chat.py @@ -0,0 +1,70 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "USER") +N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) +N_THREAD = int(env_or_def("N_THREAD", "8")) + +today = datetime.datetime.today() +DATE_YEAR=today.strftime("%Y") +DATE_TIME=today.strftime("%H:%M") + +prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What year is it? +{AI_NAME}: We are in {DATE_YEAR}. +{USER_NAME}: Please tell me the largest city in Europe. +{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. +{USER_NAME}: What can you tell me about Moscow? +{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. 
Outside its walls is Red Square, Russia’s symbolic center. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: How do I pass command line arguments to a Node.js program? +{AI_NAME}: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. +{USER_NAME}: Name a color. +{AI_NAME}: Blue. +{USER_NAME}: What time is it? +{AI_NAME}: It is {DATE_TIME}. +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/Miku.py b/examples/Miku.py new file mode 100644 index 000000000..86fd1d74c --- /dev/null +++ b/examples/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! 
+{AI_NAME}: What do you like to do in your free time? ^_^ +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_batch=1024, + n_ctx=2048, + n_keep=-1, + repeat_last_n=256, + repeat_penalty=1.17647, + temp=0.7, + top_k=40, + top_p=0.5, + model=MODEL, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + prompt=prompt, +) + +if N_THREAD > 0: + params.n_threads = N_THREAD + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/ReasonAct.py b/examples/ReasonAct.py new file mode 100644 index 000000000..cf0a13747 --- /dev/null +++ b/examples/ReasonAct.py @@ -0,0 +1,49 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") + +prompt=f"""You run in a loop of Thought, Action, Observation. +At the end of the loop either Answer or restate your Thought and Action. +Use Thought to describe your thoughts about the question you have been asked. +Use Action to run one of these actions available to you: +- calculate[python math expression] +Observation will be the result of running those actions + + +Question: What is 4 * 7 / 3? +Thought: Do I need to use an action? Yes, I use calculate to do math +Action: calculate[4 * 7 / 3] +Observation: 9.3333333333 +Thought: Do I need to use an action? No, have the result +Answer: The calculate tool says it is 9.3333333333 +Question: What is capital of france? +Thought: Do I need to use an action? No, I know the answer +Answer: Paris is the capital of France +Question:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + interactive=True, + interactive_start=True, + top_k=10000, + temp=0.2, + repeat_penalty=1, + n_threads=7, + n_ctx=2048, + antiprompt=["Question:","Observation:"], + model=MODEL, + input_prefix=" ", + n_predict=-1, + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/common.py b/examples/common.py index 061ec3ae9..9a465db6e 100644 --- a/examples/common.py +++ b/examples/common.py @@ -1,8 +1,9 @@ import os import argparse +import re from dataclasses import dataclass, field -from typing import List, Optional +from typing import List # Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp @@ -12,23 +13,35 @@ class GptParams: seed: int = -1 n_threads: int = min(4, os.cpu_count() or 1) n_predict: int = 128 - repeat_last_n: int = 64 n_parts: int = -1 n_ctx: int = 512 n_batch: int = 8 n_keep: int = 0 + ignore_eos: bool = False + logit_bias: dict[int, float] = field(default_factory=dict) top_k: int = 40 top_p: float = 0.95 + tfs_z: float = 1.00 + typical_p: float = 1.00 temp: float = 0.80 repeat_penalty: float = 1.10 + repeat_last_n: int = 64 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 + mirostat: int = 0 + mirostat_tau: float = 5.0 + mirostat_eta: float = 0.1 model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" + path_session: str = "" input_prefix: str = " " - antiprompt: List[str] = field(default_factory=list) + lora_adapter: str = "" + lora_base: str = "" + memory_f16: bool = True random_prompt: bool = False use_color: bool = False @@ -38,7 +51,7 @@ class GptParams: interactive_start: bool = False instruct: bool = False - ignore_eos: bool = False + penalize_nl: bool = True perplexity: bool = False 
use_mmap: bool = True use_mlock: bool = False @@ -61,59 +74,42 @@ class GptParams: instruct_inp_suffix: str="\n\n### Response:\n\n" -def gpt_params_parse(argv = None, params: Optional[GptParams] = None): - if params is None: - params = GptParams() - +def gpt_params_parse(argv = None): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") - parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") - parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") - parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + + parser.add_argument( + "-l", + "--logit-bias", + type=str, + action='append', + help="--logit-bias TOKEN_ID(+/-)BIAS", + dest="logit_bias_str" + ) + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") + parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") + parser.add_argument("--mirostat", type=float, default=1.0, 
help="use Mirostat sampling.",dest="mirostat") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau") + parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") - parser.add_argument( - "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" - ) - parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument( - "--interactive-start", - action="store_true", - help="run in interactive mode", - dest="interactive" - ) - parser.add_argument( - "--interactive-first", - action="store_true", - help="run in interactive mode and wait for input right away", - dest="interactive_start" - ) - parser.add_argument( - "-ins", - "--instruct", - action="store_true", - help="run in instruction mode (use with Alpaca or Vicuna models)", - dest="instruct" - ) - parser.add_argument( - "--color", - action="store_true", - help="colorise output to distinguish prompt and user input from generations", - dest="use_color" - ) - parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") - parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") - parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") - parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument( "-r", "--reverse-prompt", @@ -122,16 +118,71 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + + parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") + parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") + + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default="", help="string to 
prefix user inputs with", dest="input_prefix") + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive_start" + ) + + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + + #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") + + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) + args = parser.parse_args(argv) - return args + + logit_bias_str = args.logit_bias_str + delattr(args, "logit_bias_str") + params = GptParams(**vars(args)) + + if (params.lora_adapter): + params.use_mmap = False + + if (logit_bias_str != None): + for i in logit_bias_str: + if (m := re.match(r"(\d+)([-+]\d+)", i)): + params.logit_bias[int(m.group(1))] = int(m.group(2)) + + return params def gpt_random_prompt(rng): return [ @@ -148,4 +199,4 @@ def gpt_random_prompt(rng): ][rng % 10] if __name__ == "__main__": - print(GptParams(gpt_params_parse())) + print(gpt_params_parse()) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index e046c2a79..d024f0860 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -10,9 +10,10 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. 
""" +import ctypes import sys from time import time -from os import cpu_count +from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt @@ -77,6 +78,7 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 self.n_past = 0 + self.n_session_consumed = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo @@ -94,6 +96,19 @@ specified) expect poor results""", file=sys.stderr) if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter, + self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ | {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) @@ -117,13 +132,49 @@ specified) expect poor results""", file=sys.stderr) with open(self.params.file) as f: self.params.prompt = f.read() + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_int() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 0): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + # tokenize the prompt self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) - if (len(self.embd_inp) > self.params.n_ctx - 4): + if (len(self.embd_inp) > self.n_ctx - 4): raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + # debug message about similarity of saved session, if applicable + n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + break + n_matching_session_tokens += 1 + + if n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): self.params.n_keep = len(self.embd_inp) @@ -132,6 +183,7 @@ specified) expect poor results""", file=sys.stderr) self.inp_suffix = 
self._tokenize(self.params.instruct_inp_suffix, False) # in instruct mode, we inject a prefix and a suffix to each input by the user + self.antiecho = None if (self.params.instruct): self.params.interactive_start = True _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) @@ -171,16 +223,24 @@ number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) if len(self.params.input_prefix) > 0: print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) - print(f"""sampling: temp = {self.params.temp},\ + print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty},\ +presence_penalty = {self.params.presence_penalty},\ +frequency_penalty = {self.params.frequency_penalty},\ top_k = {self.params.top_k},\ +tfs_z = {self.params.tfs_z},\ top_p = {self.params.top_p},\ -repeat_last_n = {self.params.repeat_last_n},\ -repeat_penalty = {self.params.repeat_penalty} +typical_p = {self.params.typical_p},\ +temp = {self.params.temp},\ +mirostat = {self.params.mirostat},\ +mirostat_lr = {self.params.mirostat_eta},\ +mirostat_ent = {self.params.mirostat_tau},\ -generate: n_ctx = {self.n_ctx}, \ -n_batch = {self.params.n_batch}, \ -n_predict = {self.params.n_predict}, \ +generate: n_ctx = {self.n_ctx},\ +n_batch = {self.params.n_batch},\ +n_predict = {self.params.n_predict},\ n_keep = {self.params.n_keep} + """, file=sys.stderr) # determine antiprompt tokens @@ -198,6 +258,9 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(CONSOLE_COLOR_PROMPT) + self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) + + # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() @@ -229,31 +292,117 @@ n_keep = {self.params.n_keep} self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] self.embd = _insert + self.embd + self.params.path_session = "" + + # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + # REVIEW + if self.n_session_consumed < len(self.session_tokens): + for i in range(len(self.embd)): + if self.embd[i] != self.session_tokens[self.n_session_consumed]: + self.session_tokens = self.session_tokens[:self.n_session_consumed] + break + + self.n_past += 1 + self.n_session_consumed += 1 + + if self.n_session_consumed >= len(self.session_tokens): + i += 1 + break + + if i > 0: + self.embd = self.embd[i:] + + # evaluate tokens in batches + # embd is typically prepared beforehand to fit within a batch, but not always + #TODO BUG: The batching code causes nonsensical generation + """for i in range(0, len(self.embd), self.params.n_batch): + n_eval = self.params.n_batch + _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + print(f"failed to eval") + return + + self.n_past += n_eval""" if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") + if len(self.embd) > 0 and not len(self.params.path_session) > 0: + self.session_tokens.extend(self.embd) + self.n_session_consumed = len(self.session_tokens) + self.n_past += len(self.embd) self.embd = [] - if len(self.embd_inp) <= self.input_consumed: + if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting # out of user input, sample next token + top_k = 
llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k + repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n - if (self.params.ignore_eos): - logits = llama_cpp.llama_get_logits(self.ctx) - logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0) + # optionally save the session on first sample (for faster prompt loading next time) + if len(self.params.path_session) > 0 and self.need_to_save_session: + self.need_to_save_session = False + llama_cpp.llama_save_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + self.session_tokens, + len(self.session_tokens) + ) + + id = 0 + + logits = llama_cpp.llama_get_logits(self.ctx) + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + + # Apply params.logit_bias map + for key, value in self.params.logit_bias.items(): + logits[key] += value + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + # Apply penalties + nl_logit = logits[llama_cpp.llama_token_nl()] + last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) + + _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) + llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + + if not self.params.penalize_nl: + logits[llama_cpp.llama_token_nl()] = nl_logit + + if self.params.temp <= 0: + # Greedy sampling + id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + else: + if self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + else: + # Temperature sampling + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # print("`{}`".format(candidates_p.size)) - _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] - id = llama_cpp.llama_sample_top_p_top_k( - self.ctx, - (llama_cpp.llama_token * len(_arr))(*_arr), - len(_arr), - self.params.top_k, - self.params.top_p, - self.params.temp, - self.params.repeat_penalty, - ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) @@ -288,7 +437,7 @@ n_keep = {self.params.n_keep} # display tokens if self.output_echo: for id in self.embd: - if self.params.instruct: + if self.antiecho != None: for r in self.antiecho(id): yield r 
else: @@ -316,7 +465,7 @@ n_keep = {self.params.n_keep} if (not self.params.instruct): for i in self.llama_token_eot: yield i - break + break # respect n_predict even if antiprompt is present if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): @@ -356,7 +505,7 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") # read user input def read_input(self): @@ -415,8 +564,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. {AI_NAME}: Blue {USER_NAME}:""" - args = gpt_params_parse() - params = GptParams(**vars(args)) + params = gpt_params_parse() with LLaMAInteract(params) as m: m.interact() diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 4fb5a0366..9e38ec7cb 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -37,6 +37,10 @@ embd = [] last_n_size = 64 last_n_tokens_data = [0] * last_n_size n_batch = 24 +last_n_repeat = 64 +repeat_penalty = 1 +frequency_penalty = 0.0 +presence_penalty = 0.0 while remaining_tokens > 0: if len(embd) > 0: @@ -47,15 +51,28 @@ while remaining_tokens > 0: n_past += len(embd) embd = [] if len(embd_inp) <= input_consumed: - id = llama_cpp.llama_sample_top_p_top_k( - ctx, - (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data), - len(last_n_tokens_data), - 40, - 0.8, - 0.2, - 1.0 / 0.85, - ) + logits = llama_cpp.llama_get_logits(ctx) + n_vocab = llama_cpp.llama_n_vocab(ctx) + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, + _arr, + last_n_repeat, repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, + _arr, + last_n_repeat, frequency_penalty, presence_penalty) + + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) + id = llama_cpp.llama_sample_token(ctx, candidates_p) + last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) input_noecho = False From 335cd8d947cc2cf4608885629dc9e63eaa061150 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:18:25 +0200 Subject: [PATCH 52/77] Rename postfix to suffix to match upstream --- examples/Chat.py | 1 + examples/common.py | 4 ++-- examples/low_level_api_chat_cpp.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/Chat.py b/examples/Chat.py index 9283fcb8a..c78347168 100644 --- a/examples/Chat.py +++ b/examples/Chat.py @@ -63,6 +63,7 @@ params = GptParams( interactive=True, antiprompt=[f"{USER_NAME}:"], input_prefix=" ", + input_suffix=f"{AI_NAME}:", prompt=prompt, ) diff --git a/examples/common.py b/examples/common.py index 9a465db6e..75a952583 100644 --- a/examples/common.py +++ b/examples/common.py @@ -37,6 +37,7 @@ class GptParams: prompt: str = "" path_session: str = "" input_prefix: str = " " + input_suffix: str = "" antiprompt: 
List[str] = field(default_factory=list) lora_adapter: str = "" @@ -64,7 +65,6 @@ class GptParams: # Set to "\nUser:" etc. # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" fix_prefix: str = "" - output_postfix: str = "" input_echo: bool = True, # Default instructions for Alpaca @@ -110,6 +110,7 @@ def gpt_params_parse(argv = None): parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument( "-r", "--reverse-prompt", @@ -158,7 +159,6 @@ def gpt_params_parse(argv = None): #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") parser.add_argument( diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index d024f0860..d3a7d466f 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -527,8 +527,8 @@ n_keep = {self.params.n_keep} self.input(self.read_input()) else: print(self.params.input_prefix, end="") - self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") - print(self.params.output_postfix,end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") self.set_color(CONSOLE_COLOR_DEFAULT) try: From bbf6848cb07b8fd73f80baa5d546eacd27f9c8b2 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:27:52 +0200 Subject: [PATCH 53/77] Wrong logit_bias parsed type --- examples/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/common.py b/examples/common.py index 75a952583..b51c28b16 100644 --- a/examples/common.py +++ b/examples/common.py @@ -180,7 +180,7 @@ def gpt_params_parse(argv = None): if (logit_bias_str != None): for i in logit_bias_str: if (m := re.match(r"(\d+)([-+]\d+)", i)): - params.logit_bias[int(m.group(1))] = int(m.group(2)) + params.logit_bias[int(m.group(1))] = float(m.group(2)) return params From f8ba031576fadd86601664a43916d4489387fa19 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:42 +0200 Subject: [PATCH 54/77] Fix lora --- examples/low_level_api_chat_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index d3a7d466f..12f7e4510 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -102,8 +102,8 @@ specified) expect poor results""", file=sys.stderr) if (len(self.params.lora_adapter) > 0): if (llama_cpp.llama_apply_lora_from_file( self.ctx, - self.params.lora_adapter, - self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else 
None, self.params.n_threads ) != 0): print("error: failed to apply lora adapter") From 0bf36a77aead8bdd2f73c3a960afb32bf10de916 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:35:50 +0200 Subject: [PATCH 55/77] Fix mirastat requiring c_float --- examples/low_level_api_chat_cpp.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 12f7e4510..c55ca2fdf 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -372,10 +372,10 @@ n_keep = {self.params.n_keep} _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.repeat_penalty) + last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) if not self.params.penalize_nl: logits[llama_cpp.llama_token_nl()] = nl_logit @@ -387,19 +387,19 @@ n_keep = {self.params.n_keep} if self.params.mirostat == 1: mirostat_mu = 2.0 * self.params.mirostat_tau mirostat_m = 100 - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) elif self.params.mirostat == 2: mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) From fb79c567d242f85b5d00e8b60f231a8560918250 Mon Sep 17 00:00:00 2001 From: Mug 
<2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:03 +0200 Subject: [PATCH 56/77] Fix session loading and saving in low level example chat --- examples/low_level_api_chat_cpp.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index c55ca2fdf..205a5b76c 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -138,16 +138,17 @@ specified) expect poor results""", file=sys.stderr) if (path.exists(self.params.path_session)): _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() - _n_token_count_out = llama_cpp.c_int() + _n_token_count_out = llama_cpp.c_size_t() if (llama_cpp.llama_load_session_file( self.ctx, self.params.path_session.encode("utf8"), _session_tokens, self.params.n_ctx, ctypes.byref(_n_token_count_out) - ) != 0): + ) != 1): print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) return + _n_token_count_out = _n_token_count_out.value self.session_tokens = _session_tokens[:_n_token_count_out] print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) else: @@ -161,19 +162,21 @@ specified) expect poor results""", file=sys.stderr) raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") # debug message about similarity of saved session, if applicable - n_matching_session_tokens = 0 + self.n_matching_session_tokens = 0 if len(self.session_tokens) > 0: for id in self.session_tokens: - if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: break - n_matching_session_tokens += 1 + self.n_matching_session_tokens += 1 - if n_matching_session_tokens >= len(self.embd_inp): + if self.n_matching_session_tokens >= len(self.embd_inp): print(f"session file has exact match for prompt!") - elif n_matching_session_tokens < (len(self.embd_inp) / 2): - print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") else: - print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + + self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): @@ -258,9 +261,6 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(CONSOLE_COLOR_PROMPT) - self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) - - # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() @@ -329,7 +329,7 @@ n_keep = {self.params.n_keep} ) != 0): raise Exception("Failed to llama_eval!") - if len(self.embd) > 0 and not len(self.params.path_session) > 0: + if len(self.embd) > 0 and 
len(self.params.path_session) > 0: self.session_tokens.extend(self.embd) self.n_session_consumed = len(self.session_tokens) @@ -346,7 +346,7 @@ n_keep = {self.params.n_keep} llama_cpp.llama_save_session_file( self.ctx, self.params.path_session.encode("utf8"), - self.session_tokens, + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), len(self.session_tokens) ) From b5531e14350531943953846301c94c96f6ab2aca Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Fri, 26 May 2023 06:35:15 -0700 Subject: [PATCH 57/77] low_level_api_chat_cpp.py: Fix missing antiprompt output in chat. --- examples/low_level_api_chat_cpp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index 205a5b76c..e67cd8e43 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -409,12 +409,15 @@ n_keep = {self.params.n_keep} # replace end of text token with newline token when in interactive mode if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] + self.embd.append(id) if (self.use_antiprompt()): # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] - - # add it to the context - self.embd.append(id) + for id in self.first_antiprompt[0]: + self.embd.append(id) + else: + # add it to the context + self.embd.append(id) # echo this to console self.output_echo = True From a439fe15295657bf6cdc4d06a7d6cce92c8c6902 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 12 May 2023 14:28:22 -0400 Subject: [PATCH 58/77] Allow model to tokenize strings longer than context length and set add_bos. Closes #92 --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index a56243dc9..f2366effe 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -337,7 +337,7 @@ def llama_tokenize( tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) From 731c71255b86000d956baf9ddd75992296796288 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:22:27 -0400 Subject: [PATCH 59/77] Add types for all low-level api functions --- examples/llama_cpp.py | 81 ++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index f2366effe..fce7fce1c 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -17,7 +17,7 @@ import pathlib # Load the library -def _load_shared_library(lib_base_name): +def _load_shared_library(lib_base_name: str): # Determine the file extension based on the platform if sys.platform.startswith("linux"): lib_ext = ".so" @@ -252,7 +252,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -262,7 +264,9 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read -def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: +def llama_set_state_data( + ctx: llama_context_p, src # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_set_state_data(ctx, src) @@ -274,9 +278,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out, + n_token_count_out, # type: Array[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -294,7 +298,10 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( - ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t + ctx: llama_context_p, + path_session: bytes, + tokens, # type: Array[llama_token] + n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -433,8 +440,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -456,8 +463,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -484,7 +491,10 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
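# An illustrative sketch (mirroring the low_level_api examples in this series)
# of how a candidates array is typically built from the raw logits before
# calling the samplers below:
#
#   logits = llama_get_logits(ctx)
#   n_vocab = llama_n_vocab(ctx)
#   _arr = (llama_token_data * n_vocab)(
#       *[llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab)]
#   )
#   candidates_p = ctypes.pointer(llama_token_data_array(_arr, len(_arr), False))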
-def llama_sample_softmax(ctx: llama_context_p, candidates): +def llama_sample_softmax( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -497,7 +507,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + k: c_int, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -513,7 +526,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -529,7 +545,10 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + z: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -545,7 +564,10 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -559,7 +581,11 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None -def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): +def llama_sample_temperature( + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + temp: c_float +): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -578,7 +604,12 @@ _lib.llama_sample_temperature.restype = None # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
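# Illustrative call sequence (a sketch): mu is persistent state that the
# sampler updates through the pointer argument, so it is kept in a c_float and
# passed by reference; the chat example in this series applies the temperature
# first and uses m = 100 as suggested in the paper.
#
#   mu = ctypes.c_float(2.0 * tau)
#   llama_sample_temperature(ctx, candidates_p, ctypes.c_float(temp))
#   id = llama_sample_token_mirostat(
#       ctx, candidates_p, ctypes.c_float(tau), ctypes.c_float(eta),
#       ctypes.c_int(100), ctypes.byref(mu)
#   )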
def llama_sample_token_mirostat( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + m: c_int, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -600,7 +631,11 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -616,7 +651,10 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. -def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token_greedy( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -628,7 +666,10 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. -def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From f20b34a3beb550761e11c2f0dee55ed755670a8c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:22:55 -0400 Subject: [PATCH 60/77] Add return type annotations for embeddings and logits --- examples/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index fce7fce1c..e6638ed17 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -381,7 +381,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -391,7 +391,7 @@ _lib.llama_get_logits.restype = POINTER(c_float) # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p): # type: (...) 
-> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) From 7862b520ec021f3fe76507e6857ebb1f677de6b7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 13:54:22 -0400 Subject: [PATCH 61/77] Fix llama_cpp types --- examples/llama_cpp.py | 60 +++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index e6638ed17..6b3994f13 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -8,6 +8,7 @@ from ctypes import ( c_void_p, c_bool, POINTER, + _Pointer, # type: ignore Structure, Array, c_uint8, @@ -252,9 +253,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. # Returns the number of bytes copied -def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -278,9 +277,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, # type: Array[llama_token] + tokens_out: Array[llama_token], n_token_capacity: c_size_t, - n_token_count_out, # type: Array[c_size_t] + n_token_count_out: _Pointer[c_size_t], ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -300,7 +299,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -321,7 +320,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -440,8 +439,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, ): @@ -463,8 +462,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -491,10 +490,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
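# Sketch (mirroring the chat example in this series): per-token logit biases,
# when used, are added directly to the logits buffer before the candidates
# array is built; logit_bias below is an illustrative {token_id: float} map.
#
#   logits = llama_get_logits(ctx)
#   for token_id, bias in logit_bias.items():
#       logits[token_id] += bias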
-def llama_sample_softmax( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] -): +def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): return _lib.llama_sample_softmax(ctx, candidates) @@ -508,9 +504,9 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], k: c_int, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -527,9 +523,9 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], p: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -546,9 +542,9 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], z: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -565,9 +561,9 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], p: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -582,9 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -605,11 +599,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], tau: c_float, eta: c_float, m: c_int, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -632,10 +626,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], tau: c_float, eta: c_float, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -652,8 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -667,8 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From ff31330d7f6e2b6e6279ca9d00838f1723adff15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:30 -0400 Subject: [PATCH 62/77] Fix candidates type --- examples/llama_cpp.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 6b3994f13..66bb82cf5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -439,7 +439,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, @@ -462,7 +462,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, @@ -504,7 +504,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -523,7 +523,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -542,7 +542,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
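# For reference, the low_level_api_chat example in this series chains the
# filters below in this order on its temperature path (one workable
# arrangement, not a requirement):
#
#   llama_sample_top_k(ctx, candidates_p, top_k)
#   llama_sample_tail_free(ctx, candidates_p, ctypes.c_float(tfs_z))
#   llama_sample_typical(ctx, candidates_p, ctypes.c_float(typical_p))
#   llama_sample_top_p(ctx, candidates_p, ctypes.c_float(top_p))
#   llama_sample_temperature(ctx, candidates_p, ctypes.c_float(temp))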
def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -561,7 +561,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -578,7 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -599,7 +599,7 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, m: c_int, @@ -626,7 +626,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, mu: _Pointer[c_float], @@ -646,7 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -660,7 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. 
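# In the chat example in this series the final draw depends on temperature:
# llama_sample_token_greedy(ctx, candidates_p) when temp <= 0, otherwise
# llama_sample_token(ctx, candidates_p) after the filters above have been applied.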
def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 0c2fb05361df1327c07f66f10c733a601a30f601 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:04:12 -0400 Subject: [PATCH 63/77] Fix: types --- examples/llama_cpp.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 66bb82cf5..30e8f47be 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -141,6 +141,11 @@ LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + # Functions @@ -257,7 +262,7 @@ def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_ return _lib.llama_copy_state_data(ctx, dest) -_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_copy_state_data.restype = c_size_t @@ -269,7 +274,7 @@ def llama_set_state_data( return _lib.llama_set_state_data(ctx, src) -_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_set_state_data.restype = c_size_t @@ -291,7 +296,7 @@ _lib.llama_load_session_file.argtypes = [ c_char_p, llama_token_p, c_size_t, - POINTER(c_size_t), + c_size_t_p, ] _lib.llama_load_session_file.restype = c_size_t @@ -340,7 +345,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_max_tokens: c_int, add_bos: c_bool, ) -> int: @@ -385,7 +390,7 @@ def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # typ _lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = POINTER(c_float) +_lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input @@ -395,7 +400,7 @@ def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # _lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = POINTER(c_float) +_lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. Uses the vocabulary in the provided context @@ -614,7 +619,7 @@ _lib.llama_sample_token_mirostat.argtypes = [ c_float, c_float, c_int, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat.restype = llama_token @@ -639,7 +644,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ llama_token_data_array_p, c_float, c_float, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat_v2.restype = llama_token From 4885e55ccdfdaa21e115c16fe42e0dd8e5e16339 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:12:26 -0400 Subject: [PATCH 64/77] Fix: runtime type errors --- examples/llama_cpp.py | 52 +++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 30e8f47be..62069a471 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -258,7 +258,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. 
# Destination needs to have allocated enough memory. # Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -282,9 +284,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out: Array[llama_token], + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out: _Pointer[c_size_t], + n_token_count_out, # type: _Pointer[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -304,7 +306,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -325,7 +327,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -345,7 +347,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, ) -> int: @@ -444,8 +446,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -467,8 +469,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -495,7 +497,9 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): +def llama_sample_softmax( + ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -509,7 +513,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -528,7 +532,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -547,7 +551,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -566,7 +570,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -583,7 +587,9 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + temp: c_float, ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -604,11 +610,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, m: c_int, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -631,10 +637,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -651,7 +657,8 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. 
def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -665,7 +672,8 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 690588410ef6f227ba069efe0f81e3b7baeedfe9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:30:14 -0400 Subject: [PATCH 65/77] Fix return type --- examples/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 62069a471..72bc443e5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -260,7 +260,7 @@ _lib.llama_get_state_size.restype = c_size_t # Returns the number of bytes copied def llama_copy_state_data( ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_copy_state_data(ctx, dest) @@ -272,7 +272,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Returns the number of bytes read def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_set_state_data(ctx, src) From 3808a73751f11bc92757bba77237f12e8d04b599 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 22:08:11 -0400 Subject: [PATCH 66/77] Fix obscure Wndows DLL issue. Closes #208 --- examples/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 72bc443e5..a0261b742 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -52,7 +52,7 @@ def _load_shared_library(lib_base_name: str): for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), winmode=0) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") From 59f80d2a0db7bfa77f97d711f2b47d2706c8681d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:04:22 -0400 Subject: [PATCH 67/77] Fix mlock_supported and mmap_supported return type --- examples/llama_cpp.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index a0261b742..2eb519380 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -157,7 +157,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params -def llama_mmap_supported() -> c_bool: +def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -165,7 +165,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool -def llama_mlock_supported() -> c_bool: +def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -387,7 +387,9 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_logits( + ctx: llama_context_p, +): # type: (...) 
-> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +399,9 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_embeddings( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) From 7609c73ee6d939006bdd2ddc103975a5344e7216 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 00:12:47 -0400 Subject: [PATCH 68/77] Update llama.cpp (remove min_keep default value) --- examples/llama_cpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 2eb519380..0ea37b6ee 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -519,7 +519,7 @@ def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] k: c_int, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -538,7 +538,7 @@ def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -557,7 +557,7 @@ def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] z: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -576,7 +576,7 @@ def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) From a83d11750762b9d0b9456400ae04daae7966b270 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 15 May 2023 09:15:01 -0400 Subject: [PATCH 69/77] Add winmode arg only on windows if python version supports it --- examples/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 0ea37b6ee..3d86a6150 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -44,15 +44,17 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path), winmode=0) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") From aae6c03e94d51ccde93d8412c71023ee2462b284 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 14 May 2023 00:04:22 -0400 Subject: [PATCH 70/77] Update llama.cpp --- examples/llama_cpp.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3d86a6150..81435deeb 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -70,7 +70,7 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = c_int(1) 
+LLAMA_FILE_VERSION = c_int(2) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" @@ -111,6 +111,7 @@ class llama_context_params(Structure): _fields_ = [ ("n_ctx", c_int), # text context ("n_parts", c_int), # -1 for default + ("n_gpu_layers", c_int), # number of layers to store in VRAM ("seed", c_int), # RNG seed, 0 for random ("f16_kv", c_bool), # use fp16 for KV cache ( @@ -137,7 +138,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors # LLAMA_FTYPE_MOSTLY_Q4_3 = c_int(6) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors @@ -261,9 +262,9 @@ _lib.llama_get_state_size.restype = c_size_t # Destination needs to have allocated enough memory. # Returns the number of bytes copied def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] + ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: - return _lib.llama_copy_state_data(ctx, dest) + return _lib.llama_copy_state_data(ctx, dst) _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] From 66c27f31204039ade897dead8d23c24d5bda2bc9 Mon Sep 17 00:00:00 2001 From: Aneesh Joy Date: Wed, 17 May 2023 18:04:58 +0100 Subject: [PATCH 71/77] Fixd CUBLAS dll load issue in Windows --- examples/llama_cpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 81435deeb..3ce1820e2 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -48,6 +48,7 @@ def _load_shared_library(lib_base_name: str): # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors From 601b19203f37d67c767f2c9126dffdcdead369ea Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 17 May 2023 15:26:38 -0400 Subject: [PATCH 72/77] Check for CUDA_PATH before adding --- examples/llama_cpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 3ce1820e2..a8f90f861 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -48,7 +48,9 @@ def _load_shared_library(lib_base_name: str): # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors From fda33ddbd510485a889cc4b3d43a51fa4438cbca Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 19 May 2023 11:59:33 -0400 Subject: [PATCH 73/77] Fix llama_cpp and Llama type signatures. 
Closes #221 --- examples/llama_cpp.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index a8f90f861..6bddadff3 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -206,7 +206,7 @@ _lib.llama_free.restype = None # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int -) -> c_int: +) -> int: return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) @@ -225,7 +225,7 @@ def llama_apply_lora_from_file( path_lora: c_char_p, path_base_model: c_char_p, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) @@ -234,7 +234,7 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -253,7 +253,7 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: +def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -293,7 +293,7 @@ def llama_load_session_file( tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, n_token_count_out, # type: _Pointer[c_size_t] -) -> c_size_t: +) -> int: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out ) @@ -314,7 +314,7 @@ def llama_save_session_file( path_session: bytes, tokens, # type: Array[llama_token] n_token_count: c_size_t, -) -> c_size_t: +) -> int: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -337,7 +337,7 @@ def llama_eval( n_tokens: c_int, n_past: c_int, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) @@ -364,7 +364,7 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int -def llama_n_vocab(ctx: llama_context_p) -> c_int: +def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -372,7 +372,7 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int -def llama_n_ctx(ctx: llama_context_p) -> c_int: +def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -380,7 +380,7 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int -def llama_n_embd(ctx: llama_context_p) -> c_int: +def llama_n_embd(ctx: llama_context_p) -> int: return _lib.llama_n_embd(ctx) @@ -426,7 +426,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -def llama_token_bos() -> llama_token: +def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -434,7 +434,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -def llama_token_eos() -> llama_token: +def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -442,7 +442,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -def llama_token_nl() -> llama_token: +def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -625,7 +625,7 @@ def llama_sample_token_mirostat( 
eta: c_float, m: c_int, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -651,7 +651,7 @@ def llama_sample_token_mirostat_v2( tau: c_float, eta: c_float, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -669,7 +669,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -684,7 +684,7 @@ _lib.llama_sample_token_greedy.restype = llama_token def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token(ctx, candidates) From 60a7c76339c8f2866bba17a07a4014cd98be60ce Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 21 May 2023 17:47:21 -0400 Subject: [PATCH 74/77] Update llama.cpp --- examples/llama_cpp.py | 219 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 178 insertions(+), 41 deletions(-) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 6bddadff3..7c27e3948 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -72,31 +72,61 @@ _lib_base_name = "llama" # Load the library _lib = _load_shared_library(_lib_base_name) -# C types -LLAMA_FILE_VERSION = c_int(2) -LLAMA_FILE_MAGIC = b"ggjt" -LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" -LLAMA_SESSION_MAGIC = b"ggsn" +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# llama.h bindings + +# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define LLAMA_FILE_VERSION 3 +LLAMA_FILE_VERSION = c_int(3) +LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT +LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML +LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_VERSION = c_int(1) +# struct llama_context; llama_context_p = c_void_p +# typedef int llama_token; llama_token = c_int llama_token_p = POINTER(llama_token) +# typedef struct llama_token_data { +# llama_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } llama_token_data; class llama_token_data(Structure): _fields_ = [ - ("id", llama_token), # token id - ("logit", c_float), # log-odds of the token - ("p", c_float), # probability of the token + ("id", llama_token), + ("logit", c_float), + ("p", c_float), ] llama_token_data_p = POINTER(llama_token_data) +# typedef struct llama_token_data_array { +# llama_token_data * data; +# size_t size; +# bool sorted; +# } llama_token_data_array; class llama_token_data_array(Structure): _fields_ = [ ("data", llama_token_data_p), @@ -107,54 +137,72 @@ class llama_token_data_array(Structure): llama_token_data_array_p = POINTER(llama_token_data_array) +# typedef void (*llama_progress_callback)(float progress, void *ctx); llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +# 
struct llama_context_params { +# int n_ctx; // text context +# int n_gpu_layers; // number of layers to store in VRAM +# int seed; // RNG seed, -1 for random + +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the llama_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only + + +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# }; class llama_context_params(Structure): _fields_ = [ - ("n_ctx", c_int), # text context - ("n_parts", c_int), # -1 for default - ("n_gpu_layers", c_int), # number of layers to store in VRAM - ("seed", c_int), # RNG seed, 0 for random - ("f16_kv", c_bool), # use fp16 for KV cache + ("n_ctx", c_int), + ("n_gpu_layers", c_int), + ("seed", c_int), + ("f16_kv", c_bool), ( "logits_all", c_bool, - ), # the llama_eval() call computes all logits, not just the last one - ("vocab_only", c_bool), # only load the vocabulary, no weights - ("use_mmap", c_bool), # use mmap if possible - ("use_mlock", c_bool), # force system to keep model in RAM - ("embedding", c_bool), # embedding mode only - # called with a progress value between 0 and 1, pass NULL to disable + ), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), ("progress_callback", llama_progress_callback), - # context pointer passed to the progress callback ("progress_callback_user_data", c_void_p), ] llama_context_params_p = POINTER(llama_context_params) +# enum llama_ftype { +# LLAMA_FTYPE_ALL_F32 = 0, +# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# }; LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( - 4 -) # tok_embeddings.weight and output.weight are F16 -# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_3 = c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# Functions +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) +# LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -163,6 +211,7 @@ 
_lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +# LLAMA_API bool llama_mmap_supported(); def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -171,6 +220,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool +# LLAMA_API bool llama_mlock_supported(); def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -179,9 +229,33 @@ _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool -# Various functions for loading a ggml llama model. -# Allocate (almost) all memory needed for the model. -# Return NULL on failure +# // TODO: not great API - very likely to change +# // Initialize the llama + ggml backend +# // Call once at the start of the program +# LLAMA_API void llama_init_backend(); +def llama_init_backend(): + return _lib.llama_init_backend() + + +_lib.llama_init_backend.argtypes = [] +_lib.llama_init_backend.restype = None + + +# LLAMA_API int64_t llama_time_us(); +def llama_time_us() -> int: + return _lib.llama_time_us() + + +_lib.llama_time_us.argtypes = [] +_lib.llama_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml llama model. +# // Allocate (almost) all memory needed for the model. +# // Return NULL on failure +# LLAMA_API struct llama_context * llama_init_from_file( +# const char * path_model, +# struct llama_context_params params); def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: @@ -193,6 +267,7 @@ _lib.llama_init_from_file.restype = llama_context_p # Frees all allocated memory +# LLAMA_API void llama_free(struct llama_context * ctx); def llama_free(ctx: llama_context_p): return _lib.llama_free(ctx) @@ -204,6 +279,11 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given +# LLAMA_API int llama_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# enum llama_ftype ftype, +# int nthread); def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int ) -> int: @@ -220,6 +300,11 @@ _lib.llama_model_quantize.restype = c_int # The model needs to be reloaded before applying a new adapter, otherwise the adapter # will be applied on top of the previous one # Returns 0 on success +# LLAMA_API int llama_apply_lora_from_file( +# struct llama_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); def llama_apply_lora_from_file( ctx: llama_context_p, path_lora: c_char_p, @@ -234,6 +319,7 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache +# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -243,6 +329,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the current rng seed. 
+# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) @@ -253,6 +340,7 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens +# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -264,6 +352,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. # Returns the number of bytes copied +# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); def llama_copy_state_data( ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: @@ -276,6 +365,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read +# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] ) -> int: @@ -287,6 +377,7 @@ _lib.llama_set_state_data.restype = c_size_t # Save/load session file +# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); def llama_load_session_file( ctx: llama_context_p, path_session: bytes, @@ -309,6 +400,7 @@ _lib.llama_load_session_file.argtypes = [ _lib.llama_load_session_file.restype = c_size_t +# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); def llama_save_session_file( ctx: llama_context_p, path_session: bytes, @@ -331,6 +423,12 @@ _lib.llama_save_session_file.restype = c_size_t # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls # Returns 0 on success +# LLAMA_API int llama_eval( +# struct llama_context * ctx, +# const llama_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); def llama_eval( ctx: llama_context_p, tokens, # type: Array[llama_token] @@ -350,6 +448,12 @@ _lib.llama_eval.restype = c_int # Returns the number of tokens on success, no more than n_max_tokens # Returns a negative number on failure - the number of tokens that would have been returned # TODO: not sure if correct +# LLAMA_API int llama_tokenize( +# struct llama_context * ctx, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); def llama_tokenize( ctx: llama_context_p, text: bytes, @@ -364,6 +468,7 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -372,6 +477,7 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int +# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -380,6 +486,7 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int +# LLAMA_API int llama_n_embd (const struct llama_context * ctx); def llama_n_embd(ctx: llama_context_p) -> int: return 
_lib.llama_n_embd(ctx) @@ -393,6 +500,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab +# LLAMA_API float * llama_get_logits(struct llama_context * ctx); def llama_get_logits( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -405,6 +513,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); def llama_get_embeddings( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -416,6 +525,7 @@ _lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -426,6 +536,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens +# LLAMA_API llama_token llama_token_bos(); def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -434,6 +545,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token +# LLAMA_API llama_token llama_token_eos(); def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -442,6 +554,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token +# LLAMA_API llama_token llama_token_nl(); def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -454,6 +567,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -477,6 +591,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -507,6 +622,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
+# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] ): @@ -521,6 +637,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -540,6 +657,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -559,6 +677,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -578,6 +697,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -596,6 +716,7 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); def llama_sample_temperature( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -618,6 +739,7 @@ _lib.llama_sample_temperature.restype = None # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); def llama_sample_token_mirostat( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -645,6 +767,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. 
A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -666,6 +789,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -681,6 +805,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -698,6 +823,7 @@ _lib.llama_sample_token.restype = llama_token # Performance information +# LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) @@ -706,6 +832,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p] _lib.llama_print_timings.restype = None +# LLAMA_API void llama_reset_timings(struct llama_context * ctx); def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) @@ -715,9 +842,19 @@ _lib.llama_reset_timings.restype = None # Print system information +# LLAMA_API const char * llama_print_system_info(void); def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() _lib.llama_print_system_info.argtypes = [] _lib.llama_print_system_info.restype = c_char_p + +################################################################################################### + + +_llama_initialized = False + +if not _llama_initialized: + llama_init_backend() + _llama_initialized = True From 4ad62c489d76b633480d8bfd1d1d2e974db67f1b Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Mon, 22 May 2023 23:54:57 -0700 Subject: [PATCH 75/77] fix "missing 1 required positional argument: 'min_keep'" --- examples/low_level_api_chat_cpp.py | 8 ++++---- examples/low_level_api_llama_cpp.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py index e67cd8e43..5e8172434 100644 --- a/examples/low_level_api_chat_cpp.py +++ b/examples/low_level_api_chat_cpp.py @@ -395,10 +395,10 @@ n_keep = {self.params.n_keep} id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - 
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z),min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p),min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p),min_keep=llama_cpp.c_size_t(1)) llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py index 9e38ec7cb..2d1bab3f8 100644 --- a/examples/low_level_api_llama_cpp.py +++ b/examples/low_level_api_llama_cpp.py @@ -68,8 +68,8 @@ while remaining_tokens > 0: _arr, last_n_repeat, frequency_penalty, presence_penalty) - llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) - llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, min_keep=llama_cpp.c_size_t(1)) llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) id = llama_cpp.llama_sample_token(ctx, candidates_p) From e5dad2afa06f702bc926675db65ad87426ed736b Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Tue, 23 May 2023 06:21:31 -0700 Subject: [PATCH 76/77] Look for libllama in parent directory --- examples/llama_cpp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py index 7c27e3948..643c94bf5 100644 --- a/examples/llama_cpp.py +++ b/examples/llama_cpp.py @@ -31,10 +31,12 @@ def _load_shared_library(lib_base_name: str): # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() + _base_path_parent = pathlib.Path(__file__).parent.parent.resolve() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths = [ _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path_parent / f"lib{lib_base_name}{lib_ext}", _base_path / f"{lib_base_name}{lib_ext}", ] From 93278f84cf06b447cc4964e5d4435bf51af174f5 Mon Sep 17 00:00:00 2001 From: Don Mahurin <@> Date: Tue, 23 May 2023 06:21:31 -0700 Subject: [PATCH 77/77] low_level_api_chat_cpp.py: fix default path_prefix arg value to match class default value --- examples/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/common.py b/examples/common.py index b51c28b16..2a14917c5 100644 --- a/examples/common.py +++ b/examples/common.py @@ -108,7 +108,7 @@ def gpt_params_parse(argv = None): parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") - parser.add_argument("--session", type=str, default=None, help="file to cache 
model state in (may be large!)",dest="path_session") + parser.add_argument("--session", type=str, default="", help="file to cache model state in (may be large!)",dest="path_session") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument(