diff --git a/klite.embd b/klite.embd
index d2580a425..f7104f23a 100644
--- a/klite.embd
+++ b/klite.embd
@@ -26,8 +26,8 @@ Kobold Lite is under the AGPL v3.0 License for the purposes of koboldcpp. Please
-
+
@@ -489,6 +489,10 @@ Unsaved data will be lost.
Improved Prompt ? Modifies the context, injecting tokens to improve adventure quality for adventure mode.
diff --git a/koboldcpp.dll b/koboldcpp.dll
index b682d241c..25ba23ba7 100644
Binary files a/koboldcpp.dll and b/koboldcpp.dll differ
diff --git a/koboldcpp.py b/koboldcpp.py
index 60aad0819..2e0e678ea 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -5,7 +5,6 @@
import ctypes
import os
-import psutil
import argparse
import json, http.server, threading, socket, sys, time
@@ -122,8 +121,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
if self.path.endswith(('/api/v1/model', '/api/latest/model')):
self.send_response(200)
self.end_headers()
- result = {'result': friendlymodelname }
- self.wfile.write(json.dumps(result).encode())
+ result = {'result': friendlymodelname }
+ self.wfile.write(json.dumps(result).encode())
return
if self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
@@ -191,6 +190,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
newprompt = fullprompt
recvtxt = ""
+ res = {}
if kai_api_flag:
recvtxt = generate(
prompt=newprompt,
@@ -204,10 +204,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
seed=-1
)
print("\nOutput: " + recvtxt)
- res = {"results": [{"text": recvtxt}]}
- self.send_response(200)
- self.end_headers()
- self.wfile.write(json.dumps(res).encode())
+ res = {"results": [{"text": recvtxt}]}
else:
recvtxt = generate(
prompt=newprompt,
@@ -221,9 +218,13 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
)
print("\nOutput: " + recvtxt)
res = {"data": {"seqs":[recvtxt]}}
+
+ try:
self.send_response(200)
self.end_headers()
self.wfile.write(json.dumps(res).encode())
+ except:
+ print("Generate: The response could not be sent, maybe the connection was terminated?")
modelbusy = False
return
self.send_response(404)
@@ -278,7 +279,7 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
def stop(self):
self.httpd.server_close()
- numThreads = 5
+ numThreads = 6
threadArr = []
for i in range(numThreads):
threadArr.append(Thread(i))
@@ -356,8 +357,10 @@ if __name__ == '__main__':
portgroup.add_argument("port", help="Port to listen on", default=5001, nargs="?", type=int)
parser.add_argument("--host", help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="")
- physical_core_limit = psutil.cpu_count(logical=False)
- # logical_core_limit = (os.cpu_count() if os.cpu_count()<=4 else max(4,os.cpu_count()-4))
+ # psutil.cpu_count(logical=False)
+ physical_core_limit = 1
+ if os.cpu_count()!=None and os.cpu_count()>1:
+ physical_core_limit = int(os.cpu_count()/2)
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
diff --git a/koboldcpp_blas.dll b/koboldcpp_blas.dll
index 0c1cfa481..f62ae8eac 100644
Binary files a/koboldcpp_blas.dll and b/koboldcpp_blas.dll differ
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index 13529cc20..8e3d77379 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -131,7 +131,8 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
//fast forward the past based on identical tokens, stop once a divergence is noted
int embd_inp_len = embd_inp.size();
- for (int i = 0; i < current_context_tokens.size(); ++i)
+ int ctxcs = current_context_tokens.size();
+ for (int i = 0; i < ctxcs; ++i)
{
if (current_context_tokens[i] == embd_inp[i])
{
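The loop above fast-forwards n_past over the prefix that the new prompt shares with the previously evaluated context, so those tokens are not re-evaluated. A standalone sketch of that comparison is below; the sample vectors and main() wrapper are illustrative, and caching .size() into an int (as the hunk does with ctxcs) is presumably there to keep the loop index comparison signed.

    #include <cstdio>
    #include <vector>
    #include <algorithm>

    int main()
    {
        std::vector<int> current_context_tokens = {1, 50, 27, 33, 9};
        std::vector<int> embd_inp               = {1, 50, 27, 42, 7, 11};

        int n_past = 0;
        int ctxcs = (int)current_context_tokens.size();
        int limit = std::min(ctxcs, (int)embd_inp.size());
        for (int i = 0; i < limit; ++i)
        {
            if (current_context_tokens[i] != embd_inp[i])
                break;        // divergence: stop fast-forwarding
            ++n_past;         // identical token: reuse it from the existing context
        }
        printf("tokens reusable from previous context: %d\n", n_past);
        return 0;
    }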
@@ -203,7 +204,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
n_past += embd.size();
embd.clear();
- if ((int)embd_inp.size() <= input_consumed)
+ if ((int)embd_inp_size <= input_consumed)
{
// out of user input, sample next token
const float top_k = params.top_k;
@@ -247,7 +248,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
else
{
// some user input remains from prompt or interaction, forward it to processing
- while ((int)embd_inp.size() > input_consumed)
+ while ((int)embd_inp_size > input_consumed)
{
embd.push_back(embd_inp[input_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
@@ -262,7 +263,9 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
}
}
time2 = timer_check();
- printf("\nTime Taken - Processing:%.1fs, Generation:%.1fs, Total:%.1fs", time1, time2, (time1 + time2));
+ float pt1 = (time1*1000.0/(embd_inp_size==0?1:embd_inp_size));
+ float pt2 = (time2*1000.0/(params.n_predict==0?1:params.n_predict));
+ printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs", time1, pt1, time2, pt2, (time1 + time2));
fflush(stdout);
output.status = 1;
snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
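The updated printf above reports milliseconds per token alongside the raw totals, guarding against division by zero when no tokens were processed or predicted. A small standalone sketch of that arithmetic, with an illustrative print_timings() wrapper and example token counts (not part of llama_adapter.cpp):

    #include <cstdio>

    static void print_timings(float time1, float time2, int prompt_tokens, int predicted_tokens)
    {
        // convert elapsed seconds to ms per token, avoiding division by zero
        float pt1 = (time1 * 1000.0f / (prompt_tokens == 0 ? 1 : prompt_tokens));
        float pt2 = (time2 * 1000.0f / (predicted_tokens == 0 ? 1 : predicted_tokens));
        printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs\n",
               time1, pt1, time2, pt2, (time1 + time2));
    }

    int main()
    {
        print_timings(1.2f, 8.4f, 64, 80); // e.g. 64 prompt tokens, 80 generated tokens
        return 0;
    }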
diff --git a/otherarch/ggml_v1.c b/otherarch/ggml_v1.c
index ee1de6241..4a96ab96c 100644
--- a/otherarch/ggml_v1.c
+++ b/otherarch/ggml_v1.c
@@ -1,3 +1,6 @@
+// Defines CLOCK_MONOTONIC and asprintf on Linux
+#define _GNU_SOURCE
+
#include "ggml_v1.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
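The define added above has to appear before the first #include, because glibc's feature-test macros are only honoured if they are set before any system header is processed. A minimal standalone illustration (not part of ggml_v1.c) of what _GNU_SOURCE unlocks here, CLOCK_MONOTONIC from <time.h> and asprintf from <stdio.h>:

    #define _GNU_SOURCE

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    int main(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);

        char *msg = NULL;
        if (asprintf(&msg, "monotonic time: %ld.%09ld s", (long)ts.tv_sec, (long)ts.tv_nsec) != -1)
        {
            puts(msg);
            free(msg);
        }
        return 0;
    }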