Added multiuser mode (queues incoming requests instead of blocking them)
This commit is contained in:
parent
4218641d97
commit
53885de6db
2 changed files with 38 additions and 19 deletions
2
class.py
2
class.py
|
@ -37,7 +37,7 @@ class model_backend(InferenceModel):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
def is_valid(self, model_name, model_path, menu_path):
|
def is_valid(self, model_name, model_path, menu_path):
|
||||||
return "ggml" in model_name.lower()
|
return ("ggml" in model_name.lower() or "gguf" in model_name.lower())
|
||||||
|
|
||||||
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
|
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
|
||||||
self.filename = model_name #model_path is null, name is path for some reason
|
self.filename = model_name #model_path is null, name is path for some reason
|
||||||
|
|
41
koboldcpp.py
41
koboldcpp.py
|
@ -350,6 +350,7 @@ maxctx = 2048
|
||||||
maxhordectx = 1024
|
maxhordectx = 1024
|
||||||
maxhordelen = 256
|
maxhordelen = 256
|
||||||
modelbusy = threading.Lock()
|
modelbusy = threading.Lock()
|
||||||
|
requestsinqueue = 0
|
||||||
defaultport = 5001
|
defaultport = 5001
|
||||||
KcppVersion = "1.44"
|
KcppVersion = "1.44"
|
||||||
showdebug = True
|
showdebug = True
|
||||||
|
@ -565,7 +566,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_POST(self):
|
def do_POST(self):
|
||||||
global modelbusy
|
global modelbusy, requestsinqueue
|
||||||
content_length = int(self.headers['Content-Length'])
|
content_length = int(self.headers['Content-Length'])
|
||||||
body = self.rfile.read(content_length)
|
body = self.rfile.read(content_length)
|
||||||
basic_api_flag = False
|
basic_api_flag = False
|
||||||
|
@ -590,14 +591,19 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.path.endswith('/api/extra/abort'):
|
if self.path.endswith('/api/extra/abort'):
|
||||||
|
if requestsinqueue==0:
|
||||||
ag = handle.abort_generate()
|
ag = handle.abort_generate()
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
|
self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
|
||||||
print("\nGeneration Aborted")
|
print("\nGeneration Aborted")
|
||||||
|
else:
|
||||||
|
self.wfile.write(json.dumps({"success": "false"}).encode())
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.path.endswith('/api/extra/generate/check'):
|
if self.path.endswith('/api/extra/generate/check'):
|
||||||
|
pendtxtStr = ""
|
||||||
|
if requestsinqueue==0:
|
||||||
pendtxt = handle.get_pending_output()
|
pendtxt = handle.get_pending_output()
|
||||||
pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
|
pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
|
@ -605,7 +611,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
|
self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
|
||||||
return
|
return
|
||||||
|
|
||||||
if not modelbusy.acquire(blocking=False):
|
reqblocking = False
|
||||||
|
if args.multiuser and requestsinqueue < 4: #up to 5 concurrent requests
|
||||||
|
reqblocking = True
|
||||||
|
requestsinqueue += 1
|
||||||
|
if not modelbusy.acquire(blocking=reqblocking):
|
||||||
self.send_response(503)
|
self.send_response(503)
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write(json.dumps({"detail": {
|
self.wfile.write(json.dumps({"detail": {
|
||||||
|
@ -613,6 +623,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
"type": "service_unavailable",
|
"type": "service_unavailable",
|
||||||
}}).encode())
|
}}).encode())
|
||||||
return
|
return
|
||||||
|
if reqblocking:
|
||||||
|
requestsinqueue = (requestsinqueue - 1) if requestsinqueue>0 else 0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.path.endswith('/request'):
|
if self.path.endswith('/request'):
|
||||||
|
@ -717,7 +729,7 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
|
||||||
exitcounter = 999
|
exitcounter = 999
|
||||||
self.httpd.server_close()
|
self.httpd.server_close()
|
||||||
|
|
||||||
numThreads = 8
|
numThreads = 10
|
||||||
threadArr = []
|
threadArr = []
|
||||||
for i in range(numThreads):
|
for i in range(numThreads):
|
||||||
threadArr.append(Thread(i))
|
threadArr.append(Thread(i))
|
||||||
|
@ -919,6 +931,7 @@ def show_new_gui():
|
||||||
|
|
||||||
port_var = ctk.StringVar(value=defaultport)
|
port_var = ctk.StringVar(value=defaultport)
|
||||||
host_var = ctk.StringVar(value="")
|
host_var = ctk.StringVar(value="")
|
||||||
|
multiuser_var = ctk.IntVar()
|
||||||
horde_name_var = ctk.StringVar(value="koboldcpp")
|
horde_name_var = ctk.StringVar(value="koboldcpp")
|
||||||
horde_gen_var = ctk.StringVar(value=maxhordelen)
|
horde_gen_var = ctk.StringVar(value=maxhordelen)
|
||||||
horde_context_var = ctk.StringVar(value=maxhordectx)
|
horde_context_var = ctk.StringVar(value=maxhordectx)
|
||||||
|
@ -1098,14 +1111,16 @@ def show_new_gui():
|
||||||
makelabelentry(network_tab, "Port: ", port_var, 1, 150)
|
makelabelentry(network_tab, "Port: ", port_var, 1, 150)
|
||||||
makelabelentry(network_tab, "Host: ", host_var, 2, 150)
|
makelabelentry(network_tab, "Host: ", host_var, 2, 150)
|
||||||
|
|
||||||
# horde
|
makecheckbox(network_tab, "Multiuser Mode", multiuser_var, 3)
|
||||||
makelabel(network_tab, "Horde:", 3).grid(pady=10)
|
|
||||||
|
|
||||||
horde_name_entry, horde_name_label = makelabelentry(network_tab, "Horde Model Name:", horde_name_var, 5, 180)
|
# horde
|
||||||
horde_gen_entry, horde_gen_label = makelabelentry(network_tab, "Gen. Length:", horde_gen_var, 6, 50)
|
makelabel(network_tab, "Horde:", 5).grid(pady=10)
|
||||||
horde_context_entry, horde_context_label = makelabelentry(network_tab, "Max Context:",horde_context_var, 7, 50)
|
|
||||||
horde_apikey_entry, horde_apikey_label = makelabelentry(network_tab, "API Key (If Embedded Worker):",horde_apikey_var, 8, 180)
|
horde_name_entry, horde_name_label = makelabelentry(network_tab, "Horde Model Name:", horde_name_var, 7, 180)
|
||||||
horde_workername_entry, horde_workername_label = makelabelentry(network_tab, "Horde Worker Name:",horde_workername_var, 9, 180)
|
horde_gen_entry, horde_gen_label = makelabelentry(network_tab, "Gen. Length:", horde_gen_var, 8, 50)
|
||||||
|
horde_context_entry, horde_context_label = makelabelentry(network_tab, "Max Context:",horde_context_var, 9, 50)
|
||||||
|
horde_apikey_entry, horde_apikey_label = makelabelentry(network_tab, "API Key (If Embedded Worker):",horde_apikey_var, 10, 180)
|
||||||
|
horde_workername_entry, horde_workername_label = makelabelentry(network_tab, "Horde Worker Name:",horde_workername_var, 11, 180)
|
||||||
|
|
||||||
def togglehorde(a,b,c):
|
def togglehorde(a,b,c):
|
||||||
labels = [horde_name_label, horde_gen_label, horde_context_label, horde_apikey_label, horde_workername_label]
|
labels = [horde_name_label, horde_gen_label, horde_context_label, horde_apikey_label, horde_workername_label]
|
||||||
|
@ -1120,7 +1135,7 @@ def show_new_gui():
|
||||||
basefile = os.path.basename(model_var.get())
|
basefile = os.path.basename(model_var.get())
|
||||||
horde_name_var.set(os.path.splitext(basefile)[0])
|
horde_name_var.set(os.path.splitext(basefile)[0])
|
||||||
|
|
||||||
makecheckbox(network_tab, "Configure for Horde", usehorde_var, 4, command=togglehorde)
|
makecheckbox(network_tab, "Configure for Horde", usehorde_var, 6, command=togglehorde)
|
||||||
togglehorde(1,1,1)
|
togglehorde(1,1,1)
|
||||||
|
|
||||||
# launch
|
# launch
|
||||||
|
@ -1191,6 +1206,7 @@ def show_new_gui():
|
||||||
|
|
||||||
args.port_param = defaultport if port_var.get()=="" else int(port_var.get())
|
args.port_param = defaultport if port_var.get()=="" else int(port_var.get())
|
||||||
args.host = host_var.get()
|
args.host = host_var.get()
|
||||||
|
args.multiuser = multiuser_var.get() == 1
|
||||||
|
|
||||||
if horde_apikey_var.get()=="" or horde_workername_var.get()=="":
|
if horde_apikey_var.get()=="" or horde_workername_var.get()=="":
|
||||||
args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get()]
|
args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get()]
|
||||||
|
@ -1280,6 +1296,8 @@ def show_new_gui():
|
||||||
if "host" in dict and dict["host"]:
|
if "host" in dict and dict["host"]:
|
||||||
host_var.set(dict["host"])
|
host_var.set(dict["host"])
|
||||||
|
|
||||||
|
multiuser_var.set(1 if "multiuser" in dict and dict["multiuser"] else 0)
|
||||||
|
|
||||||
if "hordeconfig" in dict and dict["hordeconfig"] and len(dict["hordeconfig"]) > 1:
|
if "hordeconfig" in dict and dict["hordeconfig"] and len(dict["hordeconfig"]) > 1:
|
||||||
horde_name_var.set(dict["hordeconfig"][0])
|
horde_name_var.set(dict["hordeconfig"][0])
|
||||||
horde_gen_var.set(dict["hordeconfig"][1])
|
horde_gen_var.set(dict["hordeconfig"][1])
|
||||||
|
@ -1841,5 +1859,6 @@ if __name__ == '__main__':
|
||||||
parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
|
parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
|
||||||
parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
|
parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
|
||||||
parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", type=str, default="",nargs=1)
|
parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", type=str, default="",nargs=1)
|
||||||
|
parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them. Polled-streaming is disabled while multiple requests are in queue.", action='store_true')
|
||||||
|
|
||||||
main(parser.parse_args(),start_server=True)
|
main(parser.parse_args(),start_server=True)
|
Loading…
Add table
Add a link
Reference in a new issue