diff --git a/common/common.cpp b/common/common.cpp
index d924c80dc..cae17a3d2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -162,18 +162,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.n_threads = std::stoi(argv[i]);
-            if (params.n_threads <= 0) {
-                params.n_threads = std::thread::hardware_concurrency();
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            params.n_threads.resize(split_arg.size());
+            for (size_t i = 0; i < split_arg.size(); ++i) {
+                params.n_threads[i] = std::stoi(split_arg[i]);
+                if (params.n_threads[i] <= 0) {
+                    params.n_threads[i] = std::thread::hardware_concurrency();
+                }
             }
+
         } else if (arg == "-tb" || arg == "--threads-batch") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_threads_batch = std::stoi(argv[i]);
-            if (params.n_threads_batch <= 0) {
-                params.n_threads_batch = std::thread::hardware_concurrency();
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            params.n_threads_batch.resize(split_arg.size());
+            for (size_t i = 0; i < split_arg.size(); ++i) {
+                params.n_threads_batch[i] = std::stoi(split_arg[i]);
+                if (params.n_threads_batch[i] <= 0) {
+                    params.n_threads_batch[i] = std::thread::hardware_concurrency();
+                }
             }
         } else if (arg == "-td" || arg == "--threads-draft") {
             if (++i >= argc) {
@@ -976,7 +995,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        (can be specified more than once for multiple prompts).\n");
     printf("  --color               colorise output to distinguish prompt and user input from generations\n");
     printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads[0]);
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  -td N, --threads-draft N");
@@ -1135,9 +1154,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 std::string get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.n_threads[0];
+    if (params.n_threads_batch[0] != -1) {
+        os << " (n_threads_batch = " << params.n_threads_batch[0] << ")";
     }
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
 
@@ -1318,8 +1337,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_seq_max         = params.n_parallel;
     cparams.n_batch           = params.n_batch;
     cparams.n_ubatch          = params.n_ubatch;
-    cparams.n_threads         = params.n_threads;
-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads         = params.n_threads[0];
+    cparams.n_threads_batch   = params.n_threads_batch[0] == -1 ? params.n_threads[0] : params.n_threads_batch[0];
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
@@ -1363,6 +1382,7 @@ void llama_batch_add(
 }
 
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+    int32_t n_threads = params.n_threads[0];
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
@@ -1380,6 +1400,16 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+#ifdef GGML_USE_MPI
+    int node_id = llama_node_id(lctx);
+    n_threads = (node_id >= params.n_threads.size()) ? get_num_physical_cores() : params.n_threads[node_id];
+    int32_t n_threads_batch = (node_id >= params.n_threads_batch.size()) ? -1 : params.n_threads_batch[node_id];
+
+    params.n_threads[0] = n_threads; // So we can treat index 0 as what our n_threads is elsewhere
+    params.n_threads_batch[0] = n_threads_batch;
+    llama_set_n_threads(lctx, n_threads, (n_threads_batch > 0) ? n_threads_batch : get_num_physical_cores());
+#endif
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -1389,7 +1419,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
                                              ((i > 0) || params.lora_base.empty())
                                                 ? NULL
                                                 : params.lora_base.c_str(),
-                                             params.n_threads);
+                                             n_threads);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
@@ -1806,7 +1836,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads[0], std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
diff --git a/common/common.h b/common/common.h
index f3b913d9d..9c7af1ee5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -44,11 +44,10 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
-
-    int32_t n_threads             = get_num_physical_cores();
-    int32_t n_threads_draft       = -1;
-    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+    std::vector<int32_t> n_threads             = {get_num_physical_cores()};
+    std::vector<int32_t> n_threads_batch       = {-1}; // number of threads to use for batch processing (-1 = use n_threads)
+    std::vector<int32_t> n_threads_draft       = {get_num_physical_cores()};
+    std::vector<int32_t> n_threads_batch_draft = {-1}; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict             = -1;    // new tokens to predict
     int32_t n_ctx                 = 512;   // context size
     int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e2d07a631..b4b3f8a6c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -207,6 +207,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size());
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG("n_ctx: %d\n", n_ctx);
diff --git a/llama.cpp b/llama.cpp
index ec3707ff2..2bdb38434 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12668,6 +12668,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_node_id(struct llama_context * ctx) {
+#ifdef GGML_USE_MPI
+    return ggml_mpi_rank(ctx->ctx_mpi);
+
+#endif
+    return 0;
+}
+
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
     return 1;
diff --git a/llama.h b/llama.h
index fe6ba4a47..818056064 100644
--- a/llama.h
+++ b/llama.h
@@ -372,6 +372,9 @@ extern "C" {
 
     LLAMA_API size_t llama_max_devices(void);
 
+    // Get the ID of this compute node, usually 0
+    // unless running MPI, in which case it is the rank of the node
+    LLAMA_API int llama_node_id(struct llama_context * ctx);
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
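
For reviewers, a self-contained sketch of the thread-list parsing this patch adds to gpt_params_parse_ex: a -t / -tb argument such as "8,4/2" is split on ',' and '/', giving one thread count per MPI node, and any entry <= 0 falls back to std::thread::hardware_concurrency(). The helper name parse_thread_list and the example input are hypothetical and not part of the patch.

// parse_threads_sketch.cpp -- illustrative only, mirrors the splitting logic in common.cpp above
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <thread>
#include <vector>

// Split a comma/slash separated thread list into per-node counts (hypothetical helper).
static std::vector<int32_t> parse_thread_list(const std::string & arg_next) {
    // split string by , and /
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    std::vector<int32_t> n_threads(split_arg.size());
    for (size_t i = 0; i < split_arg.size(); ++i) {
        n_threads[i] = std::stoi(split_arg[i]);
        if (n_threads[i] <= 0) {
            // non-positive entries fall back to the hardware thread count, as in the patch
            n_threads[i] = std::thread::hardware_concurrency();
        }
    }
    return n_threads;
}

int main() {
    // e.g. "-t 8,4/2": node 0 gets 8 threads, node 1 gets 4, node 2 gets 2
    for (int32_t n : parse_thread_list("8,4/2")) {
        printf("%d\n", n);
    }
    return 0;
}

With the patch applied, each MPI rank would then pick its own entry from this list via llama_node_id in llama_init_from_gpt_params, falling back to get_num_physical_cores() when the list is shorter than the number of ranks.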