Allow per-node threads to be set in command-line args, add MPI support to main

Branden Butler 2023-11-01 14:55:32 -05:00
parent 32078d6fe1
commit c9d18263b3
5 changed files with 61 additions and 19 deletions


@@ -162,18 +162,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
std::string arg_next = argv[i];
// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
params.n_threads.resize(split_arg.size());
for (size_t i = 0; i < split_arg.size(); ++i) {
params.n_threads[i] = std::stoi(split_arg[i]);
if (params.n_threads[i] <= 0) {
params.n_threads[i] = std::thread::hardware_concurrency();
}
}
} else if (arg == "-tb" || arg == "--threads-batch") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
std::string arg_next = argv[i];
// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
params.n_threads_batch.resize(split_arg.size());
for (size_t i = 0; i < split_arg.size(); ++i) {
params.n_threads_batch[i] = std::stoi(split_arg[i]);
if (params.n_threads_batch[i] <= 0) {
params.n_threads_batch[i] = std::thread::hardware_concurrency();
}
}
} else if (arg == "-td" || arg == "--threads-draft") {
if (++i >= argc) {
@@ -976,7 +995,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" (can be specified more than once for multiple prompts).\n");
printf(" --color colorise output to distinguish prompt and user input from generations\n");
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads[0]);
printf(" -tb N, --threads-batch N\n");
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -td N, --threads-draft N");
@@ -1135,9 +1154,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
std::string get_system_info(const gpt_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.n_threads;
if (params.n_threads_batch != -1) {
os << " (n_threads_batch = " << params.n_threads_batch << ")";
os << "system_info: n_threads = " << params.n_threads[0];
if (params.n_threads_batch[0] != -1) {
os << " (n_threads_batch = " << params.n_threads_batch[0] << ")";
}
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
@@ -1318,8 +1337,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.n_threads = params.n_threads[0];
cparams.n_threads_batch = params.n_threads_batch[0] == -1 ? params.n_threads[0] : params.n_threads_batch[0];
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
@@ -1363,6 +1382,7 @@ void llama_batch_add(
}
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
int32_t n_threads = params.n_threads[0];
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
@@ -1380,6 +1400,16 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
return std::make_tuple(nullptr, nullptr);
}
#ifdef GGML_USE_MPI
int node_id = llama_node_id(lctx);
n_threads = (node_id >= params.n_threads.size()) ? get_num_physical_cores() : params.n_threads[node_id];
int32_t n_threads_batch = (node_id >= params.n_threads_batch.size()) ? -1 : params.n_threads_batch[node_id];
params.n_threads[0] = n_threads; // So we can treat index 0 as what our n_threads is elsewhere
params.n_threads_batch[0] = n_threads_batch;
llama_set_n_threads(lctx, n_threads, (n_threads_batch > 0) ? n_threads_batch : get_num_physical_cores());
#endif
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -1389,7 +1419,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
((i > 0) || params.lora_base.empty())
? NULL
: params.lora_base.c_str(),
params.n_threads);
n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
@@ -1806,7 +1836,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "threads: %d # default: %u\n", params.n_threads[0], std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);

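The -t/--threads and -tb/--threads-batch options above now accept a comma- or slash-separated list of per-node thread counts instead of a single integer. Below is a minimal standalone sketch of that splitting logic, mirroring the regex and fallback in the hunk above; parse_thread_list and the sample value "8,4/0" exist only for this example.

    // Split a value such as "8,4/0" on ',' and '/' into one thread count per
    // MPI node; non-positive entries fall back to the hardware thread count,
    // just as the new -t parsing does.
    #include <cstdint>
    #include <cstdio>
    #include <regex>
    #include <string>
    #include <thread>
    #include <vector>

    static std::vector<int32_t> parse_thread_list(const std::string & arg_next) {
        const std::regex regex{R"([,/]+)"};
        std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
        std::vector<std::string> split_arg{it, {}};
        std::vector<int32_t> n_threads(split_arg.size());
        for (size_t i = 0; i < split_arg.size(); ++i) {
            n_threads[i] = std::stoi(split_arg[i]);
            if (n_threads[i] <= 0) {
                n_threads[i] = (int32_t) std::thread::hardware_concurrency();
            }
        }
        return n_threads;
    }

    int main() {
        for (int32_t t : parse_thread_list("8,4/0")) {   // as if the user passed -t 8,4/0
            printf("%d\n", (int) t);                     // prints 8, 4, then the local core count
        }
    }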

@@ -44,11 +44,10 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
std::vector<int32_t> n_threads = {get_num_physical_cores()};
std::vector<int32_t> n_threads_batch = {-1}; // number of threads to use for batch processing (-1 = use n_threads)
std::vector<int32_t> n_threads_draft = {get_num_physical_cores()};
std::vector<int32_t> n_threads_batch_draft = {-1}; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)

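With gpt_params storing vectors, index 0 acts as the local default and, under MPI, each node indexes the list by its rank, falling back to the physical core count when the command line listed fewer entries than there are nodes (the logic added to llama_init_from_gpt_params above). A self-contained sketch of that lookup rule follows; threads_for_node is an illustrative name and get_num_physical_cores is stubbed here rather than taken from common.h.

    // Per-node thread selection: a listed node gets its own entry, an unlisted
    // node falls back to the local core count.
    #include <cstdint>
    #include <cstdio>
    #include <thread>
    #include <vector>

    static int32_t get_num_physical_cores() {              // stand-in for the common.h helper
        return (int32_t) std::thread::hardware_concurrency();
    }

    static int32_t threads_for_node(const std::vector<int32_t> & n_threads, int node_id) {
        if (node_id < 0 || (size_t) node_id >= n_threads.size()) {
            return get_num_physical_cores();                // node not listed on the command line
        }
        return n_threads[node_id];
    }

    int main() {
        const std::vector<int32_t> n_threads = {8, 4};      // e.g. parsed from -t 8,4
        printf("node 0 -> %d threads\n", (int) threads_for_node(n_threads, 0));
        printf("node 2 -> %d threads\n", (int) threads_for_node(n_threads, 2)); // falls back
    }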

@@ -207,6 +207,8 @@ int main(int argc, char ** argv) {
return 1;
}
llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size());
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);

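The only functional change to main is the llama_split_layers_weighted call, which hands the context one weight per MPI node describing how the model layers should be divided. The sketch below shows how such a call might look when driven directly rather than from gpt_params; the element type (float), the meaning of the weights, and the declaration are all assumptions inferred from the call site in the hunk above, not confirmed by this diff.

    // Hypothetical direct use of the call added to main: split layers evenly
    // across two MPI nodes. The declaration below is an assumption inferred
    // from the ctx/.data()/.size() call site, not copied from llama.h.
    #include <cstddef>
    #include <vector>

    struct llama_context;                                   // opaque handle from llama.h

    extern "C" void llama_split_layers_weighted(struct llama_context * ctx,
                                                float * weights, size_t n_weights);

    static void split_evenly_across_two_nodes(struct llama_context * ctx) {
        std::vector<float> mpi_layer_split = {0.5f, 0.5f};  // equal share per node (illustrative)
        llama_split_layers_weighted(ctx, mpi_layer_split.data(), mpi_layer_split.size());
    }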

@@ -12668,6 +12668,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
return result;
}
int llama_node_id(struct llama_context * ctx) {
#ifdef GGML_USE_MPI
return ggml_mpi_rank(ctx->ctx_mpi);
#endif
return 0;
}
size_t llama_max_devices(void) {
#if defined(GGML_USE_METAL)
return 1;


@@ -372,6 +372,9 @@ extern "C" {
LLAMA_API size_t llama_max_devices(void);
// Get the ID of this compute node, usually 0
// unless running MPI, in which case it is the rank of the node
LLAMA_API int llama_node_id(struct llama_context * ctx);
LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void);
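A short usage sketch for the new llama_node_id entry point, gating output so only node 0 prints, the same way the common code above uses the rank to index its per-node thread vectors; report_node is an illustrative helper and assumes a context that has already been created through the usual llama.h calls.

    // Query this process's compute-node ID: 0 without MPI, the MPI rank otherwise.
    #include <cstdio>
    #include "llama.h"

    static void report_node(struct llama_context * ctx) {
        const int node_id = llama_node_id(ctx);             // declared in the hunk above
        if (node_id == 0) {
            printf("output handled by node %d\n", node_id);
        }
    }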