From ad5676810ae3e4ad4f20423ba186d34445ea4fd9 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 16 Apr 2023 01:17:40 +0800
Subject: [PATCH] merge CLBlast improvements - GPU dequant

---
 expose.h            | 1 +
 gpttype_adapter.cpp | 4 +++-
 koboldcpp.py        | 9 ++++++---
 llama_adapter.cpp   | 4 +++-
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/expose.h b/expose.h
index 66dfde66a..67f1e1937 100644
--- a/expose.h
+++ b/expose.h
@@ -11,6 +11,7 @@ struct load_model_inputs
     const bool use_mmap;
     const bool use_smartcontext;
     const int clblast_info = 0;
+    const int blasbatchsize = 512;
 };
 struct generation_inputs
 {
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 83974e761..11f15ead7 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -30,6 +30,7 @@ static int n_past = 0;
 static int n_threads = 4;
 static int n_batch = 8;
 static bool useSmartContext = false;
+static int blasbatchsize = 512;
 static std::string modelname;
 static std::vector last_n_tokens;
 static std::vector current_context_tokens;
@@ -53,6 +54,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
+    blasbatchsize = inputs.blasbatchsize;
     params.memory_f16 = inputs.f16_kv;
     params.n_ctx = inputs.max_context_length;
     model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
@@ -208,7 +210,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = 512; //received reports of 1024 and above crashing on some models
+        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }
diff --git a/koboldcpp.py b/koboldcpp.py
index e03e96c30..783303e35 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -17,7 +17,8 @@ class load_model_inputs(ctypes.Structure):
                 ("n_parts_overwrite", ctypes.c_int),
                 ("use_mmap", ctypes.c_bool),
                 ("use_smartcontext", ctypes.c_bool),
-                ("clblast_info", ctypes.c_int)]
+                ("clblast_info", ctypes.c_int),
+                ("blasbatchsize", ctypes.c_int)]
 
 class generation_inputs(ctypes.Structure):
     _fields_ = [("seed", ctypes.c_int),
@@ -66,7 +67,7 @@ def init_library():
     handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
     handle.generate.restype = generation_outputs
 
-def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6,use_mmap=False,use_smartcontext=False):
+def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6,use_mmap=False,use_smartcontext=False,blasbatchsize=512):
     inputs = load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.batch_size = batch_size
@@ -76,6 +77,7 @@ def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwr
     inputs.f16_kv = True
     inputs.use_mmap = use_mmap
     inputs.use_smartcontext = use_smartcontext
+    inputs.blasbatchsize = blasbatchsize
     clblastids = 0
     if args.useclblast:
         clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
@@ -388,7 +390,7 @@ def main(args):
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
     modelname = os.path.abspath(ggml_selected_file)
     print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}, SmartContext: {args.smartcontext}]")
-    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap),args.smartcontext)
+    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap),args.smartcontext,args.blasbatchsize)
     print("Load Model OK: " + str(loadok))
 
     if not loadok:
@@ -435,6 +437,7 @@ if __name__ == '__main__':
     default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
+    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
    parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index ba14fe250..fde59f5b3 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -28,6 +28,7 @@ static int n_past = 0;
 static int n_threads = 4;
 static int n_batch = 8;
 static bool useSmartContext = false;
+static int blasbatchsize = 512;
 static std::string modelname;
 static llama_context *ctx;
 static std::vector last_n_tokens;
@@ -44,6 +45,7 @@ bool llama_load_model(const load_model_inputs inputs, FileFormat in_file_format)
     n_batch = inputs.batch_size;
     modelname = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
+    blasbatchsize = inputs.blasbatchsize;
     ctx_params.n_ctx = inputs.max_context_length;
     ctx_params.n_parts = -1;//inputs.n_parts_overwrite;
 
@@ -143,7 +145,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
     int original_threads = params.n_threads;
     if (blasmode)
    {
-        params.n_batch = 512; //received reports of 1024 and above crashing on some models
+        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }
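
Usage note (not part of the patch itself): the ctypes mirror of load_model_inputs in
koboldcpp.py must declare its fields in the same order as the C struct in expose.h,
which is why the new blasbatchsize member is appended after clblast_info on both
sides. A minimal sketch of that mirroring, with the field list abbreviated for
illustration only:

    import ctypes

    # Abbreviated stand-in for load_model_inputs; the real struct has more fields.
    # Field order must match the C declaration order, otherwise values land in the
    # wrong members when the struct crosses the ctypes boundary.
    class load_model_inputs(ctypes.Structure):
        _fields_ = [("clblast_info", ctypes.c_int),
                    ("blasbatchsize", ctypes.c_int)]

    inputs = load_model_inputs()
    inputs.blasbatchsize = 256  # e.g. chosen via --blasbatchsize 256 (choices: 128, 256, 512, 1024)

At runtime the value travels from the --blasbatchsize argument through load_model()
into the C++ adapters, where it replaces the previously hard-coded
params.n_batch = 512 whenever BLAS mode is active.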