diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index e755043b3..a2bbc8777 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -883,7 +883,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         llama_model_params model_params = llama_model_default_params();
         llama_context_params llama_ctx_params = llama_context_default_params();
-        llama_ctx_params.n_ctx = clamped_max_context_length;
+        llama_ctx_params.n_ctx = clamped_max_context_length + 64; //add some extra context to deal with KV fragmentation
         //llama_ctx_paran_parts = -1;
         llama_ctx_params.seed = -1;
         llama_ctx_params.f16_kv = inputs.f16_kv;
@@ -1808,7 +1808,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o

         if (!evalres)
         {
-            fprintf(stderr, "\nFailed to predict! Check your context buffer sizes!\n");
+            fprintf(stderr, "\nFailed to predict at %d! Check your context buffer sizes!\n",n_past);
             snprintf(output.text, sizeof(output.text), "%s", "");
             output.status = 0;
             generation_finished = true;
diff --git a/koboldcpp.py b/koboldcpp.py
index 955a8bbde..3ef413050 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -2164,13 +2164,14 @@ def main(launch_args,start_server=True):

     if args.port_param!=defaultport:
         args.port = args.port_param
-    print(f"Starting Kobold HTTP Server on port {args.port} at http://localhost:{args.port}/api")
-    print(f"Starting OpenAI Compatible Endpoint on port {args.port} at http://localhost:{args.port}/v1")
+
     epurl = ""
     if args.host=="":
         epurl = f"http://localhost:{args.port}"
     else:
         epurl = f"http://{args.host}:{args.port}"
+    print(f"Starting Kobold HTTP Server on port {args.port} at {epurl}/api/")
+    print(f"Starting OpenAI Compatible Endpoint on port {args.port} at {epurl}/v1/")

     if args.launch:
         try: