Remove the psutil dependency, fix a compile error on WSL, handle exceptions when sending the HTTP response, and add multiline support for the embedded Kobold
Parent: 5c1920df43
Commit: 3d650d0e25
6 changed files with 29 additions and 16 deletions
File diff suppressed because one or more lines are too long
koboldcpp.dll (BIN)
Binary file not shown.
koboldcpp.py (23 changes)
@@ -5,7 +5,6 @@
 import ctypes
 import os
-import psutil
 import argparse
 import json, http.server, threading, socket, sys, time
@@ -122,8 +121,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         if self.path.endswith(('/api/v1/model', '/api/latest/model')):
             self.send_response(200)
             self.end_headers()
-            result = {'result': friendlymodelname }
-            self.wfile.write(json.dumps(result).encode())
+            result = {'result': friendlymodelname }
+            self.wfile.write(json.dumps(result).encode())
             return

         if self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
@@ -191,6 +190,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             newprompt = fullprompt

             recvtxt = ""
+            res = {}
             if kai_api_flag:
                 recvtxt = generate(
                     prompt=newprompt,
@@ -204,10 +204,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                     seed=-1
                     )
                 print("\nOutput: " + recvtxt)
-                res = {"results": [{"text": recvtxt}]}
-                self.send_response(200)
-                self.end_headers()
-                self.wfile.write(json.dumps(res).encode())
+                res = {"results": [{"text": recvtxt}]}
             else:
                 recvtxt = generate(
                     prompt=newprompt,
@@ -221,9 +218,13 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                     )
                 print("\nOutput: " + recvtxt)
                 res = {"data": {"seqs":[recvtxt]}}

+            try:
+                self.send_response(200)
+                self.end_headers()
+                self.wfile.write(json.dumps(res).encode())
+            except:
+                print("Generate: The response could not be sent, maybe connection was terminated?")
             modelbusy = False
             return
         self.send_response(404)
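The hunks above centralize the response write for both API modes inside a single try/except, so a client that disconnects mid-generation cannot crash the handler thread. As a rough standalone sketch of the same pattern (hypothetical class name, not part of this commit), using only Python's built-in http.server; the bare except in the diff is narrowed here to the two common disconnect errors:

import http.server
import json

class SketchHandler(http.server.SimpleHTTPRequestHandler):
    def do_POST(self):
        res = {"results": [{"text": "example output"}]}  # placeholder payload
        try:
            # Writing the response can raise if the peer already closed the socket;
            # catch it and keep the server alive instead of propagating.
            self.send_response(200)
            self.end_headers()
            self.wfile.write(json.dumps(res).encode())
        except (BrokenPipeError, ConnectionResetError):
            print("Response could not be sent; the connection may have been terminated.")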
@@ -278,7 +279,7 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
         def stop(self):
             self.httpd.server_close()

-    numThreads = 5
+    numThreads = 6
     threadArr = []
     for i in range(numThreads):
         threadArr.append(Thread(i))
@@ -356,8 +357,10 @@ if __name__ == '__main__':
     portgroup.add_argument("port", help="Port to listen on", default=5001, nargs="?", type=int)
     parser.add_argument("--host", help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="")

-    physical_core_limit = psutil.cpu_count(logical=False)
+    # logical_core_limit = (os.cpu_count() if os.cpu_count()<=4 else max(4,os.cpu_count()-4))
+    # psutil.cpu_count(logical=False)
+    physical_core_limit = 1
+    if os.cpu_count()!=None and os.cpu_count()>1:
+        physical_core_limit = int(os.cpu_count()/2)
     default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
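With psutil gone, the default --threads value is derived from os.cpu_count() alone, treating half of the logical cores as an estimate of the physical core count. A minimal self-contained sketch of that heuristic (hypothetical function name, not part of the commit):

import os

def default_thread_count():
    # Estimate physical cores as half the logical cores; fall back to 1 if unknown.
    physical_core_limit = 1
    logical = os.cpu_count()
    if logical is not None and logical > 1:
        physical_core_limit = logical // 2
    # Small machines use all estimated cores; larger ones leave one core of headroom.
    return physical_core_limit if physical_core_limit <= 3 else max(3, physical_core_limit - 1)

print(default_thread_count())  # e.g. 7 on a machine reporting 16 logical cores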
Binary file not shown.
@@ -131,7 +131,8 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out

    //fast forward the past based on identical tokens, stop once a divergence is noted
    int embd_inp_len = embd_inp.size();
-   for (int i = 0; i < current_context_tokens.size(); ++i)
+   int ctxcs = current_context_tokens.size();
+   for (int i = 0; i < ctxcs; ++i)
    {
        if (current_context_tokens[i] == embd_inp[i])
        {
@@ -203,7 +204,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out

        n_past += embd.size();
        embd.clear();
-       if ((int)embd_inp.size() <= input_consumed)
+       if ((int)embd_inp_size <= input_consumed)
        {
            // out of user input, sample next token
            const float top_k = params.top_k;
@@ -247,7 +248,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
        else
        {
            // some user input remains from prompt or interaction, forward it to processing
-           while ((int)embd_inp.size() > input_consumed)
+           while ((int)embd_inp_size > input_consumed)
            {
                embd.push_back(embd_inp[input_consumed]);
                last_n_tokens.erase(last_n_tokens.begin());
@@ -262,7 +263,9 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
        }
    }
    time2 = timer_check();
-   printf("\nTime Taken - Processing:%.1fs, Generation:%.1fs, Total:%.1fs", time1, time2, (time1 + time2));
+   float pt1 = (time1*1000.0/(embd_inp_size==0?1:embd_inp_size));
+   float pt2 = (time2*1000.0/(params.n_predict==0?1:params.n_predict));
+   printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs", time1, pt1, time2, pt2, (time1 + time2));
    fflush(stdout);
    output.status = 1;
    snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
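For a sense of the new per-token figures: if prompt processing took 5.12 s over 512 input tokens, pt1 = 5.12*1000/512 = 10, so the line reports 10ms/T for processing; the generation figure is computed the same way against params.n_predict.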
@@ -1,3 +1,6 @@
+// Defines CLOCK_MONOTONIC and asprintf on Linux
+#define _GNU_SOURCE
+
 #include "ggml_v1.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)