added support for tensor_split parameter as an advanced parameter.
parent 66328fcd80
commit c7136f03d9
3 changed files with 33 additions and 1 deletion
2 expose.h

@@ -2,6 +2,7 @@
 const int stop_token_max = 10;
 const int ban_token_max = 10;
+const int tensor_split_max = 16;
 // match kobold's sampler list and order
 enum samplers
 {
@@ -46,6 +47,7 @@ struct load_model_inputs
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;
     const char * banned_tokens[ban_token_max];
+    const float tensor_split[tensor_split_max];
 };
 struct generation_inputs
 {
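Note: the fixed-size array keeps load_model_inputs a plain C-layout struct, which is what lets koboldcpp.py mirror it via ctypes further down. A minimal layout sanity check (an illustration of the constraint, not part of the commit; assumes the usual 4-byte float):

    import ctypes
    tensor_split_max = 16  # must match the constant added to expose.h
    # the Python-side field must occupy exactly the same bytes as the C array
    assert ctypes.sizeof(ctypes.c_float * tensor_split_max) == tensor_split_max * 4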
15 gpttype_adapter.cpp

@@ -475,6 +475,21 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     llama_ctx_params.rope_freq_scale = rope_freq_scale;
     llama_ctx_params.n_batch = blasbatchsize;

+    #if defined(GGML_USE_CUBLAS)
+    bool ts_all_zero = true;
+    for (int i = 0; i < tensor_split_max; ++i) {
+        if (inputs.tensor_split[i] != 0.0f) {
+            ts_all_zero = false;
+            break;
+        }
+    }
+    if(!ts_all_zero)
+    {
+        llama_ctx_params.tensor_split = inputs.tensor_split;
+        printf("CUBLAS: Applying Custom Tensor Split!\n");
+    }
+    #endif
+
     llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);

     if (llama_ctx_v3 == NULL)
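Note: the tensor_split values are proportions rather than absolute layer counts; passed through llama_context_params, llama.cpp divides work across the visible GPUs according to their relative weights. A rough sketch of the idea (my illustration; the exact rounding inside llama.cpp may differ):

    # Illustrative only: how proportions such as "3 1" could map onto layer counts.
    def split_layers(n_layers, proportions):
        total = sum(proportions)
        return [round(n_layers * p / total) for p in proportions]

    print(split_layers(40, [3.0, 1.0]))  # [30, 10]: GPU 0 takes ~75% of the layers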
17 koboldcpp.py

@@ -14,6 +14,7 @@ from concurrent.futures import ThreadPoolExecutor
 stop_token_max = 10
 sampler_order_max = 7
 ban_token_max = 10
+tensor_split_max = 16

 class load_model_inputs(ctypes.Structure):
     _fields_ = [("threads", ctypes.c_int),
@@ -38,7 +39,8 @@ class load_model_inputs(ctypes.Structure):
                 ("gpulayers", ctypes.c_int),
                 ("rope_freq_scale", ctypes.c_float),
                 ("rope_freq_base", ctypes.c_float),
-                ("banned_tokens", ctypes.c_char_p * ban_token_max)]
+                ("banned_tokens", ctypes.c_char_p * ban_token_max),
+                ("tensor_split", ctypes.c_float * tensor_split_max)]

 class generation_inputs(ctypes.Structure):
     _fields_ = [("seed", ctypes.c_int),
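Note: the all-zero check on the C side works because a ctypes.Structure zero-fills its fields on construction, so an untouched tensor_split is reliably all zeros. A standalone demonstration (demo is a stand-in name, not from the commit):

    import ctypes

    tensor_split_max = 16

    class demo(ctypes.Structure):
        # reduced stand-in for load_model_inputs, holding only the new field
        _fields_ = [("tensor_split", ctypes.c_float * tensor_split_max)]

    d = demo()
    print(list(d.tensor_split))  # sixteen 0.0 values before anything is assigned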
@@ -208,6 +210,13 @@ def load_model(model_filename):
         os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     elif (args.usecublas and "2" in args.usecublas):
         os.environ["CUDA_VISIBLE_DEVICES"] = "2"
+
+    for n in range(tensor_split_max):
+        if args.has_advanced=='advanced' and args.tensor_split and n < len(args.tensor_split):
+            inputs.tensor_split[n] = float(args.tensor_split[n])
+        else:
+            inputs.tensor_split[n] = 0
+
     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
     banned_tokens = args.bantokens
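Note: a user-supplied --tensor_split 3 1 therefore arrives on the C side as [3.0, 1.0, 0.0, ..., 0.0]; the explicit zero-fill in the else branch makes "flag not given" indistinguishable from "all zeros", which the CUBLAS block treats as "keep the default split". The same padding logic in isolation (pad_split is an illustrative name, not from the commit):

    def pad_split(values, size=16):
        # mirror of the loop above: copy what the user gave, zero the rest
        return [float(values[n]) if values and n < len(values) else 0.0 for n in range(size)]

    print(pad_split([3, 1]))  # [3.0, 1.0, 0.0, ..., 0.0]
    print(pad_split(None))    # all zeros -> C side keeps llama.cpp's default split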
@@ -1634,5 +1643,11 @@ if __name__ == '__main__':
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2'])
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
+
+    # for the seldom used esoteric commands
+    subparsers = parser.add_subparsers(title="Advanced Configs (For Experts)", dest="has_advanced")
+    advanced_subparser = subparsers.add_parser("advanced", help="Additional settings for experts. Run 'koboldcpp.py advanced --help' for more info")
+    advanced_subparser.add_argument("--tensor_split", help="CUDA with ALL set only. How to split tensors across multiple GPUs, space-separated list of proportions, e.g. 3 1", type=float, nargs='+')
+
     args = parser.parse_args()
     main(args)
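Note: because the options live behind an argparse subparser, --tensor_split only takes effect when the advanced subcommand is present, since that is what sets args.has_advanced, checked in the fill loop above. A plausible invocation splitting work roughly 3:1 across two GPUs (the model filename is illustrative, and the greedy nargs='*' of --usecublas means flag ordering needs some care):

    python koboldcpp.py --usecublas lowvram --gpulayers 40 model.ggml.bin advanced --tensor_split 3 1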