Add flag for lowvram directly into cublas launch param
Merge remote-tracking branch 'yellowrose/pr/open/LostRuins/koboldcpp/lowvram' into concedo_experimental

# Conflicts:
#	koboldcpp.py
commit 4b3a1282f0
3 changed files with 7 additions and 3 deletions
expose.h
@@ -8,6 +8,7 @@ struct load_model_inputs
     const int max_context_length;
     const int batch_size;
     const bool f16_kv;
+    const bool low_vram;
     const char * executable_path;
     const char * model_filename;
     const char * lora_filename;
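Note that ctypes matches structure fields by declaration order, not by name, so the new field must also be inserted at the same position (between f16_kv and executable_path) in the Python-side mirror of this struct; the koboldcpp.py hunk below does exactly that.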
gpttype_adapter.cpp

@@ -377,6 +377,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     //llama_ctx_paran_parts = -1;
     llama_ctx_params.seed = -1;
     llama_ctx_params.f16_kv = inputs.f16_kv;
+    llama_ctx_params.low_vram = inputs.low_vram;
     llama_ctx_params.logits_all = false;
     llama_ctx_params.use_mmap = inputs.use_mmap;
     llama_ctx_params.use_mlock = inputs.use_mlock;
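This forwards the flag into llama.cpp's llama_context_params, where low_vram tells the CUDA backend to skip allocating the VRAM scratch buffer. For orientation, a rough sketch of how the struct crosses the ctypes boundary at load time; the library filename and the argtypes/restype wiring here are illustrative assumptions, not lines from this diff:

    import ctypes

    # Assumes the load_model_inputs class from the koboldcpp.py hunk below
    # is in scope, mirroring the C struct in expose.h.
    handle = ctypes.CDLL("./koboldcpp.dll")           # platform-specific name (assumption)
    handle.load_model.argtypes = [load_model_inputs]  # struct is passed by value
    handle.load_model.restype = ctypes.c_int          # ModelLoadResult

    inputs = load_model_inputs()
    inputs.f16_kv = True
    inputs.low_vram = True  # ends up in llama_ctx_params.low_vram on the C++ side
    ret = handle.load_model(inputs)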
koboldcpp.py

@@ -16,6 +16,7 @@ class load_model_inputs(ctypes.Structure):
                 ("max_context_length", ctypes.c_int),
                 ("batch_size", ctypes.c_int),
                 ("f16_kv", ctypes.c_bool),
+                ("low_vram", ctypes.c_bool),
                 ("executable_path", ctypes.c_char_p),
                 ("model_filename", ctypes.c_char_p),
                 ("lora_filename", ctypes.c_char_p),
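The field is spliced into the _fields_ list at the same index as in expose.h. That positional match is what makes the ABI line up; the names are irrelevant to the byte layout, as a small self-contained check shows:

    import ctypes

    class Mirror(ctypes.Structure):
        _fields_ = [("f16_kv", ctypes.c_bool), ("low_vram", ctypes.c_bool)]

    class Swapped(ctypes.Structure):  # same fields, wrong order
        _fields_ = [("low_vram", ctypes.c_bool), ("f16_kv", ctypes.c_bool)]

    m = Mirror(f16_kv=True, low_vram=False)
    s = Swapped.from_buffer_copy(bytes(m))  # reinterpret the same bytes
    print(s.f16_kv, s.low_vram)  # False True -- the values land in the wrong fields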
@@ -89,7 +90,7 @@ def init_library():
     use_clblast = False #uses CLBlast instead
     use_cublas = False #uses cublas instead
     use_noavx2 = False #uses openblas with no avx2 instructions
+    print(args.usecublas)
     if args.noavx2:
         use_noavx2 = True
         if not file_exists(lib_openblas_noavx2) or (os.name=='nt' and not file_exists("libopenblas.dll")):
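Given the redefined flag at the bottom of this diff, print(args.usecublas) emits '' when the flag is absent, 'normal' for a bare --usecublas, and 'lowvram' when low-VRAM mode is requested.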
@@ -105,7 +106,7 @@ def init_library():
         else:
             print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
             use_clblast = True
-    elif args.usecublas:
+    elif (args.usecublas and args.usecublas!=""):
         if not file_exists(lib_cublas):
             print("Warning: CuBLAS library file not found. Non-BLAS library will be used.")
         else:
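Since the flag's default is now the empty string, the extra != "" test is belt-and-braces: an empty string is already falsy, so the widened condition accepts exactly the same values as the old plain truthiness check. A quick verification:

    for value in ("", "normal", "lowvram"):
        assert bool(value and value != "") == bool(value)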
@@ -160,6 +161,7 @@ def load_model(model_filename):
     inputs.batch_size = 8
     inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
     inputs.threads = args.threads
+    inputs.low_vram = (True if args.usecublas=="lowvram" else False)
     inputs.blasthreads = args.blasthreads
     inputs.f16_kv = True
     inputs.use_mmap = (not args.nommap)
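Because args.usecublas can only be '', 'normal', or 'lowvram' after parsing, the conditional expression is equivalent to a direct comparison:

    inputs.low_vram = (args.usecublas == "lowvram")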
@@ -874,7 +876,7 @@ if __name__ == '__main__':
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
-    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU.", action='store_true')
+    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU. Select lowvram to not allocate VRAM scratch buffer.", default='', const='normal', nargs='?', choices=['normal', 'lowvram'])
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
     args = parser.parse_args()
     main(args)
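The nargs='?' / const / default trio gives the flag three states: absent, bare, and with a value. A self-contained demonstration (a stand-in parser, not the one from koboldcpp.py):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--usecublas", default='', const='normal', nargs='?',
                        choices=['normal', 'lowvram'])

    print(repr(parser.parse_args([]).usecublas))                          # ''
    print(repr(parser.parse_args(["--usecublas"]).usecublas))             # 'normal'
    print(repr(parser.parse_args(["--usecublas", "lowvram"]).usecublas))  # 'lowvram'

In practice that means an invocation like python koboldcpp.py model.bin --usecublas lowvram requests CuBLAS without the VRAM scratch buffer, while a bare --usecublas keeps the previous behavior.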