diff --git a/koboldcpp.py b/koboldcpp.py
index 22de01fe4..75e5c8f8c 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -335,7 +335,7 @@ maxhordectx = 1024
 maxhordelen = 256
 modelbusy = threading.Lock()
 defaultport = 5001
-KcppVersion = "1.43"
+KcppVersion = "1.44"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
@@ -1757,6 +1757,15 @@ def main(launch_args,start_server=True):
         horde_thread.daemon = True
         horde_thread.start()
 
+    #if post-ready script specified, execute it
+    if args.onready:
+        def onready_subprocess():
+            import subprocess
+            print("Starting Post-Load subprocess...")
+            subprocess.Popen(args.onready[0], shell=True)
+        timer_thread = threading.Timer(1, onready_subprocess) #1 second delay
+        timer_thread.start()
+
     if start_server:
         print(f"Please connect to custom endpoint at {epurl}")
         asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite))
@@ -1808,5 +1817,6 @@ if __name__ == '__main__':
     compatgroup.add_argument("--usecublas", help="Use CuBLAS/hipBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq'])
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
     parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
+    parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", type=str, default="",nargs=1)
 
     main(parser.parse_args(),start_server=True)
\ No newline at end of file
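
For context: the new --onready flag schedules an arbitrary shell command to run roughly one second after the model finishes loading, by pairing threading.Timer with subprocess.Popen so the child process is launched without blocking the HTTP server loop. Below is a minimal standalone sketch of that same pattern; the run_when_ready name and the example command are illustrative only, not part of this patch.

import subprocess
import threading

def run_when_ready(command: str, delay: float = 1.0) -> threading.Timer:
    # Schedule `command` to run after `delay` seconds, mirroring the
    # --onready hook in the diff above. Popen with shell=True launches
    # the child and returns immediately, so the caller is never blocked.
    def onready_subprocess():
        print("Starting Post-Load subprocess...")
        subprocess.Popen(command, shell=True)
    timer = threading.Timer(delay, onready_subprocess)
    timer.start()
    return timer

# Illustrative usage (hypothetical script name):
run_when_ready("./notify_model_loaded.sh")

On the command line this corresponds to an invocation along the lines of `python koboldcpp.py --model model.gguf --onready "./notify_model_loaded.sh"` (model path and script name are hypothetical). Because nargs=1 stores the value as a single-element list, the hook reads args.onready[0]; the default of "" keeps the `if args.onready:` check falsy when the flag is omitted.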