Update llama.cpp
This commit is contained in:
parent ce0ca60b56
commit d595f330e2
1 changed file with 27 additions and 5 deletions
@@ -11,10 +11,11 @@ from ctypes import (
     Structure,
     Array,
     c_uint8,
-    c_size_t
+    c_size_t,
 )
 import pathlib
 
+
 # Load the library
 def _load_shared_library(lib_base_name):
     # Determine the file extension based on the platform
@@ -33,10 +34,10 @@ def _load_shared_library(lib_base_name):
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths = [
         _base_path / f"lib{lib_base_name}{lib_ext}",
-        _base_path / f"{lib_base_name}{lib_ext}"
+        _base_path / f"{lib_base_name}{lib_ext}",
     ]
 
-    if ("LLAMA_CPP_LIB" in os.environ):
+    if "LLAMA_CPP_LIB" in os.environ:
         lib_base_name = os.environ["LLAMA_CPP_LIB"]
         _lib = pathlib.Path(lib_base_name)
         _base_path = _lib.parent.resolve()
@@ -54,7 +55,10 @@ def _load_shared_library(lib_base_name):
             except Exception as e:
                 raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
 
-    raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found")
+    raise FileNotFoundError(
+        f"Shared library with base name '{lib_base_name}' not found"
+    )
 
+
 # Specify the base name of the shared library to load
 _lib_base_name = "llama"
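
Usage sketch (not part of the commit): with this loader in place, a custom libllama build can be selected through the LLAMA_CPP_LIB environment variable before the bindings are imported; if no candidate library is found, the import fails with the FileNotFoundError above. The module name llama_cpp and the library path below are assumptions for illustration, not shown in this diff.

import os

# Point the loader at an explicit shared-library path before importing the
# bindings; _load_shared_library() resolves the parent directory from it.
os.environ["LLAMA_CPP_LIB"] = "/opt/llama/libllama.so"  # hypothetical path

import llama_cpp  # assumed module name; importing runs _load_shared_library("llama")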
@@ -106,6 +110,10 @@ class llama_context_params(Structure):
 
 llama_context_params_p = POINTER(llama_context_params)
 
+LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0)
+LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
 
 # Functions
 
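
The new LLAMA_FTYPE_* constants name the quantization formats passed to llama_model_quantize(), whose ctypes signature (c_char_p, c_char_p, c_int) -> c_int appears further down in this file. A hedged sketch of a call, assuming the Python wrapper simply forwards its three arguments; the model paths are hypothetical:

import llama_cpp

ret = llama_cpp.llama_model_quantize(
    b"models/7B/ggml-model-f16.bin",    # input model, hypothetical path
    b"models/7B/ggml-model-q4_0.bin",   # output model, hypothetical path
    llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_0,  # quantize all but 1d tensors to Q4_0
)
if ret != 0:
    raise RuntimeError("llama_model_quantize failed")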
@@ -117,18 +125,23 @@ def llama_context_default_params() -> llama_context_params:
 _lib.llama_context_default_params.argtypes = []
 _lib.llama_context_default_params.restype = llama_context_params
 
+
 def llama_mmap_supported() -> c_bool:
     return _lib.llama_mmap_supported()
 
+
 _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool
 
+
 def llama_mlock_supported() -> c_bool:
     return _lib.llama_mlock_supported()
 
+
 _lib.llama_mlock_supported.argtypes = []
 _lib.llama_mlock_supported.restype = c_bool
 
+
 # Various functions for loading a ggml llama model.
 # Allocate (almost) all memory needed for the model.
 # Return NULL on failure
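
The llama_mmap_supported()/llama_mlock_supported() wrappers shown above as context simply forward to the C library and return a c_bool. A minimal check, assuming the bindings are importable as llama_cpp; whether the corresponding flags can be toggled on llama_context_params is not visible in this hunk, so the sketch only reports support:

import llama_cpp

print("mmap supported: ", bool(llama_cpp.llama_mmap_supported()))
print("mlock supported:", bool(llama_cpp.llama_mlock_supported()))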
@@ -162,33 +175,42 @@ def llama_model_quantize(
 _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int]
 _lib.llama_model_quantize.restype = c_int
 
 
 # Returns the KV cache that will contain the context for the
 # ongoing prediction with the model.
 def llama_get_kv_cache(ctx: llama_context_p):
     return _lib.llama_get_kv_cache(ctx)
 
+
 _lib.llama_get_kv_cache.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache.restype = POINTER(c_uint8)
 
+
 # Returns the size of the KV cache
 def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t:
     return _lib.llama_get_kv_cache_size(ctx)
 
+
 _lib.llama_get_kv_cache_size.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_size.restype = c_size_t
 
+
 # Returns the number of tokens in the KV cache
 def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
     return _lib.llama_get_kv_cache_token_count(ctx)
 
+
 _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_token_count.restype = c_int
 
+
 # Sets the KV cache containing the current context for the model
-def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int):
+def llama_set_kv_cache(
+    ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int
+):
     return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count)
 
+
 _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int]
 _lib.llama_set_kv_cache.restype = None
 
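
Taken together, the four KV-cache wrappers let a caller snapshot and restore the cache. A rough sketch, assuming ctx is a valid llama_context_p obtained elsewhere (the context-creation functions are not part of this hunk) and that copying the raw bytes back is sufficient for a restore; the helper names are illustrative:

import ctypes
import llama_cpp

def save_kv_cache(ctx):
    # Copy the raw cache bytes out of the context and remember the token count.
    size = llama_cpp.llama_get_kv_cache_size(ctx)
    n_tokens = llama_cpp.llama_get_kv_cache_token_count(ctx)
    src = llama_cpp.llama_get_kv_cache(ctx)  # POINTER(c_uint8) into the context
    buf = (ctypes.c_uint8 * size)()
    ctypes.memmove(buf, src, size)
    return buf, size, n_tokens

def restore_kv_cache(ctx, buf, size, n_tokens):
    # Hand the copied bytes back; a c_uint8 array is accepted where
    # POINTER(c_uint8) appears in llama_set_kv_cache's argtypes.
    llama_cpp.llama_set_kv_cache(ctx, buf, size, n_tokens)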