From 66ef4a20e2bd5c4f3d8872a531c9043aecae8021 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Wed, 29 Nov 2023 14:29:45 +0800
Subject: [PATCH] refined multiuser mode

---
 koboldcpp.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 3dbc759a2..31fa54a6d 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -776,7 +776,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             return
 
         reqblocking = False
-        if args.multiuser and requestsinqueue < 4: #up to 5 concurrent requests
+        muint = int(args.multiuser)
+        multiuserlimit = ((muint-1) if muint > 1 else 4)
+        #backwards compatibility for up to 5 concurrent requests, use default limit of 5 if multiuser set to 1
+        if muint > 0 and requestsinqueue < multiuserlimit:
             reqblocking = True
             requestsinqueue += 1
         if not modelbusy.acquire(blocking=reqblocking):
@@ -1539,7 +1542,7 @@ def show_new_gui():
         args.port_param = defaultport if port_var.get()=="" else int(port_var.get())
 
         args.host = host_var.get()
-        args.multiuser = multiuser_var.get() == 1
+        args.multiuser = multiuser_var.get()
 
         if horde_apikey_var.get()=="" or horde_workername_var.get()=="":
             args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get()]
@@ -1636,7 +1639,8 @@ def show_new_gui():
         if "host" in dict and dict["host"]:
             host_var.set(dict["host"])
 
-        multiuser_var.set(1 if "multiuser" in dict and dict["multiuser"] else 0)
+        if "multiuser" in dict:
+            multiuser_var.set(dict["multiuser"])
 
         if "hordeconfig" in dict and dict["hordeconfig"] and len(dict["hordeconfig"]) > 1:
             horde_name_var.set(dict["hordeconfig"][0])
@@ -2174,8 +2178,8 @@ def main(launch_args,start_server=True):
         epurl = f"http://localhost:{args.port}"
     else:
         epurl = f"http://{args.host}:{args.port}"
-    print(f"Starting Kobold HTTP Server on port {args.port} at {epurl}/api/")
-    print(f"Starting OpenAI Compatible Endpoint on port {args.port} at {epurl}/v1/")
+    print(f"Starting Kobold API on port {args.port} at {epurl}/api/")
+    print(f"Starting OpenAI Compatible API on port {args.port} at {epurl}/v1/")
 
     if args.launch:
         try:
@@ -2201,7 +2205,7 @@ def main(launch_args,start_server=True):
     if start_server:
         if args.remotetunnel:
             setuptunnel()
-        print(f"Please connect to custom endpoint at {epurl}")
+        print(f"======\nPlease connect to custom endpoint at {epurl}")
         asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite, embedded_kcpp_docs))
     else:
         print(f"Server was not started, main function complete. Idling.")
@@ -2246,8 +2250,8 @@ if __name__ == '__main__':
     compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq'])
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
     parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
-    parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", type=str, default="",nargs=1)
-    parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
+    parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
+    parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=0)
     parser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true')
     parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
     parser.add_argument("--preloadstory", help="Configures a prepared story json save file to be hosted on the server, which frontends (such as Kobold Lite) can access over the API.", default="")