minor
This commit is contained in:
parent
444b975edd
commit
d41cef9326
3 changed files with 20 additions and 28 deletions
|
@ -529,9 +529,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
|
@ -540,9 +539,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||
#else
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
|
@ -551,11 +549,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
||||
#endif
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -580,15 +577,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
|
||||
if (split_arg.size() >= LLAMA_MAX_DEVICES) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
|
@ -596,14 +594,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
|
|
4
llama.h
4
llama.h
|
@ -117,7 +117,7 @@ extern "C" {
|
|||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV to different GPUs
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
||||
};
|
||||
|
||||
|
@ -188,7 +188,7 @@ extern "C" {
|
|||
// for small tensors and intermediate results (LLAMA_SPLIT_ROW)
|
||||
// ignored for LLAMA_SPLIT_LAYER
|
||||
int32_t main_gpu;
|
||||
// fraction of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
||||
const float * tensor_split;
|
||||
|
||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
|
|
|
@ -474,17 +474,17 @@ struct test_case {
|
|||
printf("compare failed ");
|
||||
}
|
||||
|
||||
if (ud.ok && cmp_ok) {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
} else {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
}
|
||||
|
||||
ggml_backend_buffer_free(buf);
|
||||
|
||||
ggml_free(ctx);
|
||||
|
||||
return ud.ok;
|
||||
if (ud.ok && cmp_ok) {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool eval_perf(ggml_backend_t backend, const char * op_name) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue