refined multiuser mode

Concedo 2023-11-29 14:29:45 +08:00
parent b75152e3e9
commit 66ef4a20e2


@@ -776,7 +776,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                 return
         reqblocking = False
-        if args.multiuser and requestsinqueue < 4: #up to 5 concurrent requests
+        muint = int(args.multiuser)
+        multiuserlimit = ((muint-1) if muint > 1 else 4)
+        #backwards compatibility for up to 5 concurrent requests, use default limit of 5 if multiuser set to 1
+        if muint > 0 and requestsinqueue < multiuserlimit:
             reqblocking = True
             requestsinqueue += 1
         if not modelbusy.acquire(blocking=reqblocking):
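
The new check derives the queue cap from the integer value of --multiuser instead of treating it as a boolean: 1 keeps the legacy behaviour of up to 5 concurrent requests (1 active plus 4 queued), a value N greater than 1 permits N-1 queued requests, and 0 disables queueing. A minimal standalone sketch of that decision, using a hypothetical helper name should_block that is not part of koboldcpp (the real code sets reqblocking inline):

# Sketch of the queue-limit logic in the hunk above; should_block is a
# hypothetical name introduced here for illustration only.
def should_block(multiuser: int, requestsinqueue: int) -> bool:
    muint = int(multiuser)
    # multiuser=1 keeps the legacy cap of 4 queued requests (5 concurrent);
    # multiuser=N with N>1 allows N-1 queued requests; 0 never queues.
    multiuserlimit = ((muint - 1) if muint > 1 else 4)
    return muint > 0 and requestsinqueue < multiuserlimit

print(should_block(1, 3))  # True: still below the legacy cap of 4 queued requests
print(should_block(3, 2))  # False: limit of 2 queued requests already reached
print(should_block(0, 0))  # False: multiuser disabled, requests are rejected instead of queued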
@@ -1539,7 +1542,7 @@ def show_new_gui():
        args.port_param = defaultport if port_var.get()=="" else int(port_var.get())
        args.host = host_var.get()
-        args.multiuser = multiuser_var.get() == 1
+        args.multiuser = multiuser_var.get()
        if horde_apikey_var.get()=="" or horde_workername_var.get()=="":
            args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get()]
@@ -1636,7 +1639,8 @@ def show_new_gui():
        if "host" in dict and dict["host"]:
            host_var.set(dict["host"])
-        multiuser_var.set(1 if "multiuser" in dict and dict["multiuser"] else 0)
+        if "multiuser" in dict:
+            multiuser_var.set(dict["multiuser"])
        if "hordeconfig" in dict and dict["hordeconfig"] and len(dict["hordeconfig"]) > 1:
            horde_name_var.set(dict["hordeconfig"][0])
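
The config-load change matters for saved settings files: the old expression collapsed any stored value to 1, while the new branch preserves the saved limit. A tiny illustrative comparison (the dict literal is a made-up example of a loaded settings file, not taken from the commit):

# Illustrative only: how the old and new load logic treat a saved limit.
saved = {"multiuser": 6}  # e.g. a value read back from a saved settings file

old_value = 1 if "multiuser" in saved and saved["multiuser"] else 0   # -> 1, the limit is lost
new_value = saved["multiuser"] if "multiuser" in saved else 0         # -> 6, the limit survives

print(old_value, new_value)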
@@ -2174,8 +2178,8 @@ def main(launch_args,start_server=True):
        epurl = f"http://localhost:{args.port}"
    else:
        epurl = f"http://{args.host}:{args.port}"
-    print(f"Starting Kobold HTTP Server on port {args.port} at {epurl}/api/")
-    print(f"Starting OpenAI Compatible Endpoint on port {args.port} at {epurl}/v1/")
+    print(f"Starting Kobold API on port {args.port} at {epurl}/api/")
+    print(f"Starting OpenAI Compatible API on port {args.port} at {epurl}/v1/")
    if args.launch:
        try:
@@ -2201,7 +2205,7 @@ def main(launch_args,start_server=True):
    if start_server:
        if args.remotetunnel:
            setuptunnel()
-        print(f"Please connect to custom endpoint at {epurl}")
+        print(f"======\nPlease connect to custom endpoint at {epurl}")
        asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite, embedded_kcpp_docs))
    else:
        print(f"Server was not started, main function complete. Idling.")
@@ -2246,8 +2250,8 @@ if __name__ == '__main__':
    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq'])
    parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
    parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
-    parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", type=str, default="",nargs=1)
-    parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
+    parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
+    parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=0)
    parser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true')
    parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
    parser.add_argument("--preloadstory", help="Configures a prepared story json save file to be hosted on the server, which frontends (such as Kobold Lite) can access over the API.", default="")