diff --git a/expose.cpp b/expose.cpp
index cd7f46a67..f473d8f5f 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -32,6 +32,7 @@ extern "C" {
         const int threads;
         const int max_context_length;
         const int batch_size;
+        const bool f16_kv;
         const char * model_filename;
         const int n_parts_overwrite = -1;
     };
@@ -75,7 +76,7 @@ extern "C" {
         ctx_params.n_ctx = inputs.max_context_length;
         ctx_params.n_parts = inputs.n_parts_overwrite;
         ctx_params.seed = -1;
-        ctx_params.f16_kv = true;
+        ctx_params.f16_kv = inputs.f16_kv;
         ctx_params.logits_all = false;
 
         ctx = llama_init_from_file(model.c_str(), ctx_params);
diff --git a/llama_for_kobold.py b/llama_for_kobold.py
index bdd875bdd..4fff65d35 100644
--- a/llama_for_kobold.py
+++ b/llama_for_kobold.py
@@ -12,6 +12,7 @@ class load_model_inputs(ctypes.Structure):
     _fields_ = [("threads", ctypes.c_int),
                 ("max_context_length", ctypes.c_int),
                 ("batch_size", ctypes.c_int),
+                ("f16_kv", ctypes.c_bool),
                 ("model_filename", ctypes.c_char_p),
                 ("n_parts_overwrite", ctypes.c_int)]
 
@@ -43,8 +44,9 @@ def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwr
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.batch_size = batch_size
     inputs.max_context_length = max_context_length #initial value to use for ctx, can be overwritten
-    inputs.threads = os.cpu_count()
+    inputs.threads = 4 #seems to outperform os.cpu_count(), it's memory bottlenecked
     inputs.n_parts_overwrite = n_parts_overwrite
+    inputs.f16_kv = False
     ret = handle.load_model(inputs)
     return ret
 
diff --git a/llamacpp.dll b/llamacpp.dll
index 5d8afda75..0430f18fa 100644
Binary files a/llamacpp.dll and b/llamacpp.dll differ
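
For reference, below is a minimal Python-side sketch of how the new f16_kv field could be surfaced as a keyword argument instead of being hardcoded to False in load_model. The DLL path handling, the extra f16_kv/threads parameters, the argtypes/restype declarations, and the example model path are assumptions for illustration only, not part of this patch; the essential point is that the ctypes Structure's field order and types must mirror the C struct in expose.cpp exactly, because ctypes marshals the struct by position.

import ctypes
import os

# Mirror of load_model_inputs in expose.cpp; field order and types must match
# the C struct exactly, since ctypes lays out and passes the struct by position.
class load_model_inputs(ctypes.Structure):
    _fields_ = [("threads", ctypes.c_int),
                ("max_context_length", ctypes.c_int),
                ("batch_size", ctypes.c_int),
                ("f16_kv", ctypes.c_bool),
                ("model_filename", ctypes.c_char_p),
                ("n_parts_overwrite", ctypes.c_int)]

def load_model(model_filename, batch_size=8, max_context_length=512,
               n_parts_overwrite=-1, f16_kv=False, threads=4):
    # Hypothetical variant of load_model that exposes f16_kv and threads as
    # keyword arguments; the DLL name and path are assumptions for this sketch.
    handle = ctypes.CDLL(os.path.join(os.path.dirname(__file__), "llamacpp.dll"))
    handle.load_model.argtypes = [load_model_inputs]
    handle.load_model.restype = ctypes.c_int
    inputs = load_model_inputs()
    inputs.threads = threads
    inputs.max_context_length = max_context_length  # initial ctx size, can be overwritten
    inputs.batch_size = batch_size
    inputs.f16_kv = f16_kv  # True stores the KV cache in fp16, roughly halving its memory use
    inputs.model_filename = model_filename.encode("UTF-8")
    inputs.n_parts_overwrite = n_parts_overwrite
    return handle.load_model(inputs)

# Example (hypothetical model path): load with an fp16 KV cache enabled.
# load_model("models/ggml-model-q4_0.bin", f16_kv=True)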