From 37659d2c4e38adebf3811d0997c8a79e9719edff Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Thu, 1 Jun 2023 22:33:50 +0800 Subject: [PATCH] allow blasbatchsize -1 which disables blas, but keeps benefits like gpu offloads. --- gpttype_adapter.cpp | 2 +- koboldcpp.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 86c13ce25..483a1f957 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -791,7 +791,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o file_format == FileFormat::GPTJ_2 || file_format == FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2); - bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas()); + bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize!=-1); // bool blasmode = false; int original_batch = params.n_batch; int original_threads = params.n_threads; diff --git a/koboldcpp.py b/koboldcpp.py index a34702bfc..f379b46a9 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -210,7 +210,7 @@ maxctx = 2048 maxlen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.27" +KcppVersion = "1.28" class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sys_version = "" @@ -648,7 +648,7 @@ if __name__ == '__main__': parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true') parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,4096,8192], default=2048) - parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[32,64,128,256,512,1024], default=512) + parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512) parser.add_argument("--stream", help="Uses pseudo streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true') parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true') parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')