merge CLBlast improvements - GPU dequant

parent 3e992eabb4, commit ad5676810a
4 changed files with 13 additions and 5 deletions
expose.h  +1

@@ -11,6 +11,7 @@ struct load_model_inputs
     const bool use_mmap;
     const bool use_smartcontext;
     const int clblast_info = 0;
+    const int blasbatchsize = 512;
 };
 struct generation_inputs
 {
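The struct above is filled in by the Python launcher changed later in this commit and passed by value across the C boundary, so the new member's C++ default initializer (= 512) never applies to values coming from Python: the field has to be mirrored, in the same position, in the ctypes declaration and set explicitly. A reduced, self-contained sketch of that constraint (only a subset of the real fields is shown; values are illustrative):

import ctypes

# Only the tail of the real struct is reproduced here; field order must match
# the C++ declaration because ctypes marshals members strictly by position.
class load_model_inputs(ctypes.Structure):
    _fields_ = [("use_mmap", ctypes.c_bool),
                ("use_smartcontext", ctypes.c_bool),
                ("clblast_info", ctypes.c_int),
                ("blasbatchsize", ctypes.c_int)]

inputs = load_model_inputs()
print(inputs.blasbatchsize)   # 0 -- ctypes zero-initializes; the C++ "= 512" is not visible here
inputs.blasbatchsize = 256    # so the launcher sets it explicitly from its own flag/default
print(inputs.blasbatchsize)   # 256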

@@ -30,6 +30,7 @@ static int n_past = 0;
 static int n_threads = 4;
 static int n_batch = 8;
 static bool useSmartContext = false;
+static int blasbatchsize = 512;
 static std::string modelname;
 static std::vector<gpt_vocab::id> last_n_tokens;
 static std::vector<gpt_vocab::id> current_context_tokens;
@@ -53,6 +54,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
+    blasbatchsize = inputs.blasbatchsize;
     params.memory_f16 = inputs.f16_kv;
     params.n_ctx = inputs.max_context_length;
     model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
@@ -208,7 +210,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = 512; //received reports of 1024 and above crashing on some models
+        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }

@@ -17,7 +17,8 @@ class load_model_inputs(ctypes.Structure):
                 ("n_parts_overwrite", ctypes.c_int),
                 ("use_mmap", ctypes.c_bool),
                 ("use_smartcontext", ctypes.c_bool),
-                ("clblast_info", ctypes.c_int)]
+                ("clblast_info", ctypes.c_int),
+                ("blasbatchsize", ctypes.c_int)]

 class generation_inputs(ctypes.Structure):
     _fields_ = [("seed", ctypes.c_int),
@@ -66,7 +67,7 @@ def init_library():
     handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
     handle.generate.restype = generation_outputs

-def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6,use_mmap=False,use_smartcontext=False):
+def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6,use_mmap=False,use_smartcontext=False,blasbatchsize=512):
     inputs = load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.batch_size = batch_size
@@ -76,6 +77,7 @@ def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwr
     inputs.f16_kv = True
     inputs.use_mmap = use_mmap
     inputs.use_smartcontext = use_smartcontext
+    inputs.blasbatchsize = blasbatchsize
     clblastids = 0
     if args.useclblast:
         clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
@@ -388,7 +390,7 @@ def main(args):
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
     modelname = os.path.abspath(ggml_selected_file)
     print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}, SmartContext: {args.smartcontext}]")
-    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap),args.smartcontext)
+    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap),args.smartcontext,args.blasbatchsize)
     print("Load Model OK: " + str(loadok))

     if not loadok:
@@ -435,6 +437,7 @@ if __name__ == '__main__':
     default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
+    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
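At parse time the new flag only accepts the values listed in choices, so nothing outside 128-1024 can ever reach load_model(). A small self-contained sketch of that behaviour (script and inputs are illustrative, not taken from the repository):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--blasbatchsize",
                    help="Sets the batch size used in BLAS processing (default 512)",
                    type=int, choices=[128, 256, 512, 1024], default=512)

print(parser.parse_args([]).blasbatchsize)                          # 512 (the default)
print(parser.parse_args(["--blasbatchsize", "256"]).blasbatchsize)  # 256
# parser.parse_args(["--blasbatchsize", "777"]) would print a usage error
# and exit, because 777 is not one of the allowed choices.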

@@ -28,6 +28,7 @@ static int n_past = 0;
 static int n_threads = 4;
 static int n_batch = 8;
 static bool useSmartContext = false;
+static int blasbatchsize = 512;
 static std::string modelname;
 static llama_context *ctx;
 static std::vector<llama_token> last_n_tokens;
@@ -44,6 +45,7 @@ bool llama_load_model(const load_model_inputs inputs, FileFormat in_file_format)
     n_batch = inputs.batch_size;
     modelname = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
+    blasbatchsize = inputs.blasbatchsize;

     ctx_params.n_ctx = inputs.max_context_length;
     ctx_params.n_parts = -1;//inputs.n_parts_overwrite;
@@ -143,7 +145,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = 512; //received reports of 1024 and above crashing on some models
+        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }