allow blasbatchsize -1 which disables blas, but keeps benefits like gpu offloads.
This commit is contained in:
parent
49272e3c53
commit
37659d2c4e
2 changed files with 3 additions and 3 deletions
|
@ -791,7 +791,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
file_format == FileFormat::GPTJ_2 ||
|
file_format == FileFormat::GPTJ_2 ||
|
||||||
file_format == FileFormat::RWKV_1 ||
|
file_format == FileFormat::RWKV_1 ||
|
||||||
file_format==FileFormat::RWKV_2);
|
file_format==FileFormat::RWKV_2);
|
||||||
bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
|
bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize!=-1);
|
||||||
// bool blasmode = false;
|
// bool blasmode = false;
|
||||||
int original_batch = params.n_batch;
|
int original_batch = params.n_batch;
|
||||||
int original_threads = params.n_threads;
|
int original_threads = params.n_threads;
|
||||||
|
|
|
@ -210,7 +210,7 @@ maxctx = 2048
|
||||||
maxlen = 256
|
maxlen = 256
|
||||||
modelbusy = False
|
modelbusy = False
|
||||||
defaultport = 5001
|
defaultport = 5001
|
||||||
KcppVersion = "1.27"
|
KcppVersion = "1.28"
|
||||||
|
|
||||||
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
sys_version = ""
|
sys_version = ""
|
||||||
|
@ -648,7 +648,7 @@ if __name__ == '__main__':
|
||||||
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
||||||
parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
|
parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
|
||||||
parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,4096,8192], default=2048)
|
parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,4096,8192], default=2048)
|
||||||
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[32,64,128,256,512,1024], default=512)
|
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
|
||||||
parser.add_argument("--stream", help="Uses pseudo streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
|
parser.add_argument("--stream", help="Uses pseudo streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
|
||||||
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
|
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
|
||||||
parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
|
parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue