minor
This commit is contained in:
parent
444b975edd
commit
d41cef9326
3 changed files with 20 additions and 28 deletions
|
@ -529,9 +529,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
|
||||||
params.n_gpu_layers = std::stoi(argv[i]);
|
params.n_gpu_layers = std::stoi(argv[i]);
|
||||||
#else
|
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||||
#endif
|
#endif
|
||||||
|
@ -540,9 +539,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
|
||||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||||
#else
|
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||||
#endif
|
#endif
|
||||||
|
@ -551,11 +549,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifdef GGML_USE_CUBLAS
|
|
||||||
params.main_gpu = std::stoi(argv[i]);
|
params.main_gpu = std::stoi(argv[i]);
|
||||||
#else
|
#ifndef GGML_USE_CUBLAS
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
|
||||||
#endif
|
#endif // GGML_USE_CUBLAS
|
||||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -580,15 +577,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifdef GGML_USE_CUBLAS
|
|
||||||
std::string arg_next = argv[i];
|
std::string arg_next = argv[i];
|
||||||
|
|
||||||
// split string by , and /
|
// split string by , and /
|
||||||
const std::regex regex{R"([,/]+)"};
|
const std::regex regex{R"([,/]+)"};
|
||||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||||
std::vector<std::string> split_arg{it, {}};
|
std::vector<std::string> split_arg{it, {}};
|
||||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
if (split_arg.size() >= LLAMA_MAX_DEVICES) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||||
if (i < split_arg.size()) {
|
if (i < split_arg.size()) {
|
||||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||||
|
@ -596,14 +594,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
params.tensor_split[i] = 0.0f;
|
params.tensor_split[i] = 0.0f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#ifndef GGML_USE_CUBLAS
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
|
||||||
#endif // GGML_USE_CUBLAS
|
|
||||||
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
|
||||||
params.mul_mat_q = false;
|
|
||||||
#else
|
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
|
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
} else if (arg == "--no-mmap") {
|
} else if (arg == "--no-mmap") {
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
|
|
4
llama.h
4
llama.h
|
@ -117,7 +117,7 @@ extern "C" {
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV to different GPUs
|
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||||
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -188,7 +188,7 @@ extern "C" {
|
||||||
// for small tensors and intermediate results (LLAMA_SPLIT_ROW)
|
// for small tensors and intermediate results (LLAMA_SPLIT_ROW)
|
||||||
// ignored for LLAMA_SPLIT_LAYER
|
// ignored for LLAMA_SPLIT_LAYER
|
||||||
int32_t main_gpu;
|
int32_t main_gpu;
|
||||||
// fraction of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
||||||
const float * tensor_split;
|
const float * tensor_split;
|
||||||
|
|
||||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||||
|
|
|
@ -474,17 +474,17 @@ struct test_case {
|
||||||
printf("compare failed ");
|
printf("compare failed ");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ud.ok && cmp_ok) {
|
|
||||||
printf("\033[1;32mOK\033[0m\n");
|
|
||||||
} else {
|
|
||||||
printf("\033[1;31mFAIL\033[0m\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_buffer_free(buf);
|
ggml_backend_buffer_free(buf);
|
||||||
|
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
|
|
||||||
return ud.ok;
|
if (ud.ok && cmp_ok) {
|
||||||
|
printf("\033[1;32mOK\033[0m\n");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\033[1;31mFAIL\033[0m\n");
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool eval_perf(ggml_backend_t backend, const char * op_name) {
|
bool eval_perf(ggml_backend_t backend, const char * op_name) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue