Allow specifying a different thread count for GPU BLAS
This commit is contained in:
parent
89044502fe
commit
4857739ab5
3 changed files with 15 additions and 2 deletions
1
expose.h
1
expose.h
|
@ -4,6 +4,7 @@ const int stop_token_max = 10;
|
||||||
struct load_model_inputs
|
struct load_model_inputs
|
||||||
{
|
{
|
||||||
const int threads;
|
const int threads;
|
||||||
|
const int blasthreads;
|
||||||
const int max_context_length;
|
const int max_context_length;
|
||||||
const int batch_size;
|
const int batch_size;
|
||||||
const bool f16_kv;
|
const bool f16_kv;
|
||||||
|
|
|
@ -41,6 +41,7 @@ static llama_context * llama_ctx_v1;
|
||||||
static gpt_params params;
|
static gpt_params params;
|
||||||
static int n_past = 0;
|
static int n_past = 0;
|
||||||
static int n_threads = 4;
|
static int n_threads = 4;
|
||||||
|
static int n_blasthreads = 4;
|
||||||
static int n_batch = 8;
|
static int n_batch = 8;
|
||||||
static bool useSmartContext = false;
|
static bool useSmartContext = false;
|
||||||
static bool unbanTokens = false;
|
static bool unbanTokens = false;
|
||||||
|
@ -137,6 +138,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
|
|
||||||
file_format = in_file_format;
|
file_format = in_file_format;
|
||||||
n_threads = params.n_threads = inputs.threads;
|
n_threads = params.n_threads = inputs.threads;
|
||||||
|
n_blasthreads = inputs.blasthreads;
|
||||||
n_batch = params.n_batch = inputs.batch_size;
|
n_batch = params.n_batch = inputs.batch_size;
|
||||||
modelname = params.model = inputs.model_filename;
|
modelname = params.model = inputs.model_filename;
|
||||||
useSmartContext = inputs.use_smartcontext;
|
useSmartContext = inputs.use_smartcontext;
|
||||||
|
@ -460,6 +462,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
{
|
{
|
||||||
params.n_threads = 1; //do not limit here anymore.
|
params.n_threads = 1; //do not limit here anymore.
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
params.n_threads = n_blasthreads;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
current_context_tokens.resize(n_past);
|
current_context_tokens.resize(n_past);
|
||||||
|
|
10
koboldcpp.py
10
koboldcpp.py
|
@ -11,6 +11,7 @@ stop_token_max = 10
|
||||||
|
|
||||||
class load_model_inputs(ctypes.Structure):
|
class load_model_inputs(ctypes.Structure):
|
||||||
_fields_ = [("threads", ctypes.c_int),
|
_fields_ = [("threads", ctypes.c_int),
|
||||||
|
("blasthreads", ctypes.c_int),
|
||||||
("max_context_length", ctypes.c_int),
|
("max_context_length", ctypes.c_int),
|
||||||
("batch_size", ctypes.c_int),
|
("batch_size", ctypes.c_int),
|
||||||
("f16_kv", ctypes.c_bool),
|
("f16_kv", ctypes.c_bool),
|
||||||
|
@ -133,6 +134,7 @@ def load_model(model_filename):
|
||||||
inputs.batch_size = 8
|
inputs.batch_size = 8
|
||||||
inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
|
inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
|
||||||
inputs.threads = args.threads
|
inputs.threads = args.threads
|
||||||
|
inputs.blasthreads = args.blasthreads
|
||||||
inputs.f16_kv = True
|
inputs.f16_kv = True
|
||||||
inputs.use_mmap = (not args.nommap)
|
inputs.use_mmap = (not args.nommap)
|
||||||
inputs.use_mlock = args.usemlock
|
inputs.use_mlock = args.usemlock
|
||||||
|
@ -183,7 +185,7 @@ maxctx = 2048
|
||||||
maxlen = 128
|
maxlen = 128
|
||||||
modelbusy = False
|
modelbusy = False
|
||||||
defaultport = 5001
|
defaultport = 5001
|
||||||
KcppVersion = "1.18"
|
KcppVersion = "1.19"
|
||||||
|
|
||||||
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
sys_version = ""
|
sys_version = ""
|
||||||
|
@ -549,8 +551,11 @@ def main(args):
|
||||||
args.threads = psutil.cpu_count(logical=False)
|
args.threads = psutil.cpu_count(logical=False)
|
||||||
print("Overriding thread count, using " + str(args.threads) + " threads instead.")
|
print("Overriding thread count, using " + str(args.threads) + " threads instead.")
|
||||||
|
|
||||||
|
if not args.blasthreads or args.blasthreads <= 0:
|
||||||
|
args.blasthreads = args.threads
|
||||||
|
|
||||||
modelname = os.path.abspath(args.model_param)
|
modelname = os.path.abspath(args.model_param)
|
||||||
print(f"Loading model: {modelname} \n[Threads: {args.threads}, SmartContext: {args.smartcontext}]")
|
print(f"Loading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}]")
|
||||||
loadok = load_model(modelname)
|
loadok = load_model(modelname)
|
||||||
print("Load Model OK: " + str(loadok))
|
print("Load Model OK: " + str(loadok))
|
||||||
|
|
||||||
|
@ -604,6 +609,7 @@ if __name__ == '__main__':
|
||||||
physical_core_limit = int(os.cpu_count()/2)
|
physical_core_limit = int(os.cpu_count()/2)
|
||||||
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
|
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
|
||||||
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
|
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
|
||||||
|
parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads", type=int, default=0)
|
||||||
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
||||||
parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
|
parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
|
||||||
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[32,64,128,256,512,1024], default=512)
|
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[32,64,128,256,512,1024], default=512)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue