Allow per-node threads to be set in command-line args, add mpi support to main

parent 32078d6fe1
commit c9d18263b3

5 changed files with 61 additions and 19 deletions
@@ -162,18 +162,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             invalid_param = true;
             break;
         }
-        params.n_threads = std::stoi(argv[i]);
-        if (params.n_threads <= 0) {
-            params.n_threads = std::thread::hardware_concurrency();
+        std::string arg_next = argv[i];
+
+        // split string by , and /
+        const std::regex regex{R"([,/]+)"};
+        std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+        std::vector<std::string> split_arg{it, {}};
+        params.n_threads.resize(split_arg.size());
+        for (size_t i = 0; i < split_arg.size(); ++i) {
+            params.n_threads[i] = std::stoi(split_arg[i]);
+            if (params.n_threads[i] <= 0) {
+                params.n_threads[i] = std::thread::hardware_concurrency();
+            }
         }
     } else if (arg == "-tb" || arg == "--threads-batch") {
         if (++i >= argc) {
             invalid_param = true;
             break;
         }
-        params.n_threads_batch = std::stoi(argv[i]);
-        if (params.n_threads_batch <= 0) {
-            params.n_threads_batch = std::thread::hardware_concurrency();
+        std::string arg_next = argv[i];
+
+        // split string by , and /
+        const std::regex regex{R"([,/]+)"};
+        std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+        std::vector<std::string> split_arg{it, {}};
+        params.n_threads_batch.resize(split_arg.size());
+        for (size_t i = 0; i < split_arg.size(); ++i) {
+            params.n_threads_batch[i] = std::stoi(split_arg[i]);
+            if (params.n_threads_batch[i] <= 0) {
+                params.n_threads_batch[i] = std::thread::hardware_concurrency();
+            }
+        }
     } else if (arg == "-td" || arg == "--threads-draft") {
         if (++i >= argc) {
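Both branches now accept a list, so a value like -t 8,4 (or -t 8/4) gives the first node 8 threads and the second 4. Distilled into a standalone helper (hypothetical parse_thread_list, not part of the commit), the new parsing logic is:

    #include <cstdint>
    #include <regex>
    #include <string>
    #include <thread>
    #include <vector>

    // Split a value like "8,4" or "8/4" into per-node thread counts,
    // mirroring the regex split in gpt_params_parse_ex above.
    static std::vector<int32_t> parse_thread_list(const std::string & arg) {
        const std::regex regex{R"([,/]+)"};
        std::sregex_token_iterator it{arg.begin(), arg.end(), regex, -1};
        std::vector<std::string> split_arg{it, {}};

        std::vector<int32_t> threads(split_arg.size());
        for (size_t i = 0; i < split_arg.size(); ++i) {
            threads[i] = std::stoi(split_arg[i]);
            if (threads[i] <= 0) {
                // non-positive entries fall back to the hardware thread count
                threads[i] = std::thread::hardware_concurrency();
            }
        }
        return threads;
    }

std::stoi still throws on non-numeric input, exactly as the old single-value path did; only the splitting step is new.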
@@ -976,7 +995,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" (can be specified more than once for multiple prompts).\n");
     printf(" --color colorise output to distinguish prompt and user input from generations\n");
     printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
-    printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
+    printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads[0]);
     printf(" -tb N, --threads-batch N\n");
     printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf(" -td N, --threads-draft N");
@@ -1135,9 +1154,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 std::string get_system_info(const gpt_params & params) {
     std::ostringstream os;

-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.n_threads[0];
+    if (params.n_threads_batch[0] != -1) {
+        os << " (n_threads_batch = " << params.n_threads_batch[0] << ")";
     }
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
@@ -1318,8 +1337,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_seq_max = params.n_parallel;
     cparams.n_batch = params.n_batch;
     cparams.n_ubatch = params.n_ubatch;
-    cparams.n_threads = params.n_threads;
-    cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads = params.n_threads[0];
+    cparams.n_threads_batch = params.n_threads_batch[0] == -1 ? params.n_threads[0] : params.n_threads_batch[0];
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
@@ -1363,6 +1382,7 @@ void llama_batch_add(
 }

 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+    int32_t n_threads = params.n_threads[0];
     auto mparams = llama_model_params_from_gpt_params(params);

     llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
@@ -1380,6 +1400,16 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }

+#ifdef GGML_USE_MPI
+    int node_id = llama_node_id(lctx);
+    n_threads = (node_id >= params.n_threads.size()) ? get_num_physical_cores() : params.n_threads[node_id];
+    int32_t n_threads_batch = (node_id >= params.n_threads_batch.size()) ? -1 : params.n_threads_batch[node_id];
+
+    params.n_threads[0] = n_threads; // So we can treat index 0 as what our n_threads is elsewhere
+    params.n_threads_batch[0] = n_threads_batch;
+    llama_set_n_threads(lctx, n_threads, (n_threads_batch > 0) ? n_threads_batch : get_num_physical_cores());
+#endif
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
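Each MPI rank indexes the thread lists by its own node ID and falls back when the list is shorter than the number of ranks (the batch list falls back to -1, meaning "follow n_threads"); index 0 is then overwritten so existing code that reads params.n_threads[0] stays correct on every rank. The selection reduces to a helper like this (hypothetical threads_for_node, shown only to make the fallback explicit; get_num_physical_cores() is declared in common.h):

    #include <cstdint>
    #include <vector>

    int32_t get_num_physical_cores(); // from common.h

    // Hypothetical helper mirroring the per-rank selection above: ranks
    // past the end of the list fall back to the physical core count.
    static int32_t threads_for_node(const std::vector<int32_t> & n_threads, int node_id) {
        if (node_id < 0 || (size_t) node_id >= n_threads.size()) {
            return get_num_physical_cores();
        }
        return n_threads[node_id];
    }

With an invocation along the lines of mpirun -np 2 ./main -t 8,4 ... (illustrative; exact flags depend on the MPI launcher), rank 0 would generate with 8 threads and rank 1 with 4.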
@@ -1389,7 +1419,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             ((i > 0) || params.lora_base.empty())
                 ? NULL
                 : params.lora_base.c_str(),
-            params.n_threads);
+            n_threads);
     if (err != 0) {
         fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
         llama_free(lctx);
@@ -1806,7 +1836,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);

     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads[0], std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
@@ -44,11 +44,10 @@ int32_t get_num_physical_cores();

 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
-    int32_t n_threads = get_num_physical_cores();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+    std::vector<int32_t> n_threads = {get_num_physical_cores()};
+    std::vector<int32_t> n_threads_batch = {-1}; // number of threads to use for batch processing (-1 = use n_threads)
+    std::vector<int32_t> n_threads_draft = {get_num_physical_cores()};
+    std::vector<int32_t> n_threads_batch_draft = {-1}; // number of threads to use for batch processing (-1 = use n_threads)

     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
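The vector defaults keep single-process behaviour unchanged: each list starts with exactly one entry, so code reading index 0 sees the same values the old scalar fields held. A quick sanity sketch (assumes only <cassert> and the struct above; the function name is illustrative):

    #include <cassert>

    void check_gpt_params_defaults() {
        gpt_params params; // freshly constructed, all defaults
        assert(params.n_threads.size() == 1);    // one entry = one node
        assert(params.n_threads_batch[0] == -1); // -1 = follow n_threads
    }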
@@ -207,6 +207,8 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size());
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG("n_ctx: %d\n", n_ctx);
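llama_split_layers_weighted distributes the model's layers across MPI nodes according to the per-node weights in params.mpi_layer_split (referenced here but declared elsewhere in the fork). A sketch of preparing an even split, assuming that field is a std::vector<float> and the helper name is hypothetical:

    #include <vector>

    // Hypothetical setup: one equal weight per node, so layers are split
    // evenly; uneven weights would bias layers toward particular nodes.
    void set_even_layer_split(gpt_params & params, size_t n_nodes) {
        params.mpi_layer_split.assign(n_nodes, 1.0f / n_nodes);
    }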
@@ -12668,6 +12668,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }

+int llama_node_id(struct llama_context * ctx) {
+#ifdef GGML_USE_MPI
+    return ggml_mpi_rank(ctx->ctx_mpi);
+#endif
+    return 0;
+}
+
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
     return 1;
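The new call makes rank-dependent behaviour straightforward. An illustrative fragment (not from the commit; assumes an initialized llama_context * ctx):

    // Under MPI every rank runs main, so unconditional output repeats
    // once per rank; gate prints on the lead node instead.
    if (llama_node_id(ctx) == 0) {
        printf("%s\n", llama_print_system_info());
    }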
llama.h (3 additions)
@@ -372,6 +372,9 @@ extern "C" {

     LLAMA_API size_t llama_max_devices(void);

+    // Get the ID of this compute node, usually 0
+    // unless running MPI, in which case it is the rank of the node
+    LLAMA_API int llama_node_id(struct llama_context * ctx);
     LLAMA_API bool llama_supports_mmap (void);
     LLAMA_API bool llama_supports_mlock (void);
     LLAMA_API bool llama_supports_gpu_offload(void);