diff --git a/common/common.cpp b/common/common.cpp
index d924c80dc..cae17a3d2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -162,18 +162,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.n_threads = std::stoi(argv[i]);
-            if (params.n_threads <= 0) {
-                params.n_threads = std::thread::hardware_concurrency();
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            params.n_threads.resize(split_arg.size());
+            for (size_t i = 0; i < split_arg.size(); ++i) {
+                params.n_threads[i] = std::stoi(split_arg[i]);
+                if (params.n_threads[i] <= 0) {
+                    params.n_threads[i] = std::thread::hardware_concurrency();
+                }
             }
+
         } else if (arg == "-tb" || arg == "--threads-batch") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_threads_batch = std::stoi(argv[i]);
-            if (params.n_threads_batch <= 0) {
-                params.n_threads_batch = std::thread::hardware_concurrency();
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            params.n_threads_batch.resize(split_arg.size());
+            for (size_t i = 0; i < split_arg.size(); ++i) {
+                params.n_threads_batch[i] = std::stoi(split_arg[i]);
+                if (params.n_threads_batch[i] <= 0) {
+                    params.n_threads_batch[i] = std::thread::hardware_concurrency();
+                }
             }
         } else if (arg == "-td" || arg == "--threads-draft") {
             if (++i >= argc) {
@@ -976,7 +995,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        (can be specified more than once for multiple prompts).\n");
     printf("  --color               colorise output to distinguish prompt and user input from generations\n");
     printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads[0]);
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  -td N, --threads-draft N");
@@ -1135,9 +1154,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 std::string get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.n_threads[0];
+    if (params.n_threads_batch[0] != -1) {
+        os << " (n_threads_batch = " << params.n_threads_batch[0] << ")";
     }
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
 
@@ -1318,8 +1337,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_seq_max         = params.n_parallel;
     cparams.n_batch           = params.n_batch;
     cparams.n_ubatch          = params.n_ubatch;
-    cparams.n_threads         = params.n_threads;
-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads         = params.n_threads[0];
+    cparams.n_threads_batch   = params.n_threads_batch[0] == -1 ? params.n_threads[0] : params.n_threads_batch[0];
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
@@ -1363,6 +1382,7 @@ void llama_batch_add(
 }
 
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+    int32_t n_threads = params.n_threads[0];
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
@@ -1380,6 +1400,16 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+#ifdef GGML_USE_MPI
+    int node_id = llama_node_id(lctx);
+    n_threads = (node_id >= params.n_threads.size()) ? get_num_physical_cores() : params.n_threads[node_id];
+    int32_t n_threads_batch = (node_id >= params.n_threads_batch.size()) ? -1 : params.n_threads_batch[node_id];
+
+    params.n_threads[0] = n_threads; // So we can treat index 0 as what our n_threads is elsewhere
+    params.n_threads_batch[0] = n_threads_batch;
+    llama_set_n_threads(lctx, n_threads, (n_threads_batch > 0) ? n_threads_batch : get_num_physical_cores());
+#endif
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -1389,7 +1419,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
                                              ((i > 0) || params.lora_base.empty())
                                                 ? NULL
                                                 : params.lora_base.c_str(),
-                                             params.n_threads);
+                                             n_threads);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
@@ -1806,7 +1836,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads[0], std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
diff --git a/common/common.h b/common/common.h
index f3b913d9d..9c7af1ee5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -44,11 +44,10 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
-
-    int32_t n_threads             = get_num_physical_cores();
-    int32_t n_threads_draft       = -1;
-    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+    std::vector<int32_t> n_threads             = {get_num_physical_cores()};
+    std::vector<int32_t> n_threads_batch       = {-1}; // number of threads to use for batch processing (-1 = use n_threads)
+    std::vector<int32_t> n_threads_draft       = {get_num_physical_cores()};
+    std::vector<int32_t> n_threads_batch_draft = {-1}; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict             = -1;    // new tokens to predict
     int32_t n_ctx                 = 512;   // context size
     int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e2d07a631..b4b3f8a6c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -207,6 +207,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size());
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG("n_ctx: %d\n", n_ctx);
diff --git a/llama.cpp b/llama.cpp
index ec3707ff2..2bdb38434 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12668,6 +12668,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_node_id(struct llama_context * ctx) {
+#ifdef GGML_USE_MPI
+    return ggml_mpi_rank(ctx->ctx_mpi);
+
+#endif
+    return 0;
+}
+
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
     return 1;
diff --git a/llama.h b/llama.h
index fe6ba4a47..818056064 100644
--- a/llama.h
+++ b/llama.h
@@ -372,6 +372,9 @@ extern "C" {
 
     LLAMA_API size_t llama_max_devices(void);
 
+    // Get the ID of this compute node, usually 0
+    // unless running MPI, in which case it is the rank of the node
+    LLAMA_API int llama_node_id(struct llama_context * ctx);
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
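
For reviewers, a self-contained sketch of the thread-list parsing this patch adds to gpt_params_parse_ex: a -t / -tb argument such as "8,4/2" is split on ',' and '/', giving one thread count per MPI node, and any entry <= 0 falls back to std::thread::hardware_concurrency(). The helper name parse_thread_list and the example input are hypothetical and not part of the patch.

// parse_threads_sketch.cpp -- illustrative only, mirrors the splitting logic in common.cpp above
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <thread>
#include <vector>

// Split a comma/slash separated thread list into per-node counts (hypothetical helper).
static std::vector<int32_t> parse_thread_list(const std::string & arg_next) {
    // split string by , and /
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    std::vector<int32_t> n_threads(split_arg.size());
    for (size_t i = 0; i < split_arg.size(); ++i) {
        n_threads[i] = std::stoi(split_arg[i]);
        if (n_threads[i] <= 0) {
            // non-positive entries fall back to the hardware thread count, as in the patch
            n_threads[i] = std::thread::hardware_concurrency();
        }
    }
    return n_threads;
}

int main() {
    // e.g. "-t 8,4/2": node 0 gets 8 threads, node 1 gets 4, node 2 gets 2
    for (int32_t n : parse_thread_list("8,4/2")) {
        printf("%d\n", n);
    }
    return 0;
}

With the patch applied, each MPI rank would then pick its own entry from this list via llama_node_id in llama_init_from_gpt_params, falling back to get_num_physical_cores() when the list is shorter than the number of ranks.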