Refactor MPI for heterogeneous cluster support.

Adds support for different options and a different number of layers per node. The per-node options are implemented by parsing command-line options from a file instead of from the command line itself, which allows each node to have its own version of this options file. The per-node layer counts are implemented as a new option, `mpi-layer-split`, that takes a list of percentages. These percentages are used to calculate the range of layers to delegate to each node. The ranges are calculated on the head node and then scattered to the other nodes to maintain a single source of truth.
2023-09-24 10:05:34 -05:00 · 2023-09-24 10:05:34 -05:00 · 3ca1ca0182
commit 3ca1ca0182
parent 4755afd1cb
9 changed files with 1187 additions and 33 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -663,6 +663,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
 #endif // GGML_USE_CUBLAS_SYCL

+        } else if (arg == "--mpi-layer-split") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            params.mpi_layer_split.resize(split_arg.size());
+            for (size_t node = 0; node < split_arg.size(); ++node) {
+                params.mpi_layer_split[node] = std::stof(split_arg[node]);
+            }
+
+
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
--- a/common/common.h
+++ b/common/common.h
@ -62,6 +62,7 @@ struct gpt_params {
    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<float> mpi_layer_split      = {1.0}; // list of percentages of the total number of layers
    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
    int32_t n_beams               = 0;     // if non-zero then use beam search of given width.
--- a/examples/mpi/CMakeLists.txt
+++ b/examples/mpi/CMakeLists.txt
@ -0,0 +1,8 @@
+set(TARGET mpi)  # name of the example executable built by this directory
+add_executable(${TARGET} mpi.cpp)  # single-source executable
+install(TARGETS ${TARGET} RUNTIME)  # install the built binary as a runtime artifact
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})  # link the common helpers, llama, and the platform thread library
+target_compile_features(${TARGET} PRIVATE cxx_std_11)  # require at least C++11
+if(TARGET BUILD_INFO)  # only when the BUILD_INFO target is defined by the parent project
+  add_dependencies(${TARGET} BUILD_INFO)  # make BUILD_INFO build first; mpi.cpp includes build-info.h
+endif()
--- a/examples/mpi/README.md
+++ b/examples/mpi/README.md
@ -0,0 +1,80 @@
+# llama.cpp/example/mpi
+
+This example program allows you to use various LLaMA language models in an easy and efficient way across an MPI cluster.
+It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+
+## Table of Contents
+
+1. [Quick Start](#quick-start)
+2. [Common Options](#common-options)
+
+## Quick Start
+
+To get started right away, write the following to a file on each node, making sure to use the correct path for the model you have:
+```bash
+--mpi-layer-split 0.8,0.2 -t 4 -m ~/llm-local/codellama-7b.Q3_K_M.gguf --color -c 512 --temp 0.0 --repeat_penalty 1.0 -n 128 -p "double fast_inverse_square_root(double x"
+```
+
+Each node may have different options. Currently, every node must pass the same number of arguments to the `--mpi-layer-split` option and the same
+model path, but these will eventually be synchronized from the head node.
+
+Next, write the hostsfile on the head node. Make sure there is only one slot on each node.
+
+Finally, run the following command on the head node to start the program across the cluster:
+
+#### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+mpirun -hostfile hostsfile -mca orte_keep_fqdn_hostnames t --bind-to none ./mpi options.txt
+```
+
+Where `hostsfile` is the file containing the cluster hostname configuration and `options.txt` is the path
+where each node can find its own options. Storing the model on a network filesystem has not yet been
+tested, and the program has not been optimized for it.
+
+#### Windows:
+Not supported currently.
+
+For an interactive experience, try this command:
+
+#### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
+'User: Hi
+AI: Hello. I am an AI chatbot. Would you like to talk?
+User: Sure!
+AI: What would you like to talk about?
+User:'
+```
+
+#### Windows:
+
+```powershell
+main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+```
+
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+#### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
+```
+
+#### Windows:
+
+```powershell
+main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
+```
+
+## Common Options
+
+In this section, we cover the most commonly used options for running the `mpi` program with the LLaMA models:
+
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+-   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `--mpi-layer-split`: Set the percentage of layers to distribute to each node. Must have the same number of arguments as the number of nodes in the cluster. Only the layer split percentages passed to the head node are used; they are scattered to all other nodes in the cluster.
--- a/examples/mpi/mpi.cpp
+++ b/examples/mpi/mpi.cpp
@ -0,0 +1,945 @@
+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "common.h"
+#include "console.h"
+#include "llama.h"
+#include "build-info.h"
+#include "grammar-parser.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <wordexp.h>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+
+static llama_context           ** g_ctx;           // set in main() to the address of its `ctx` pointer; read by sigint_handler
+static llama_model             ** g_model;         // set in main() to the address of its `model` pointer; read by sigint_handler
+static gpt_params               * g_params;        // set in main() to its local `params`; read by sigint_handler
+static std::vector<llama_token> * g_input_tokens;  // set in main() to its local `input_tokens`; read by sigint_handler
+static std::ostringstream       * g_output_ss;     // set in main() to its local `output_ss`; read by sigint_handler
+static std::vector<llama_token> * g_output_tokens; // set in main() to its local `output_tokens`; read by sigint_handler
+static bool is_interacting = false;                // set to true by sigint_handler on SIGINT; main() also assigns it from params.interactive_first
+
+// Write a YAML log of the current run to "<params.logdir><timestamp>.yml".
+//
+// Does nothing when params.logdir is empty. The log directory is created (with parents)
+// if needed; if that fails, or the log file cannot be opened, a message is printed to
+// stderr and the function returns without writing anything.
+//
+//   ctx           - context passed to the YAML dump helpers (run info and timings)
+//   params        - run parameters; params.logdir selects the output directory
+//   model         - model whose description is recorded in the log
+//   input_tokens  - tokens forwarded to dump_non_result_info_yaml
+//   output        - generated text, written under the "output" key
+//   output_tokens - generated tokens, written under the "output_tokens" key
+//
+// The token vectors and the output string are taken by const reference so that a call
+// does not deep-copy them; the function only reads them.
+void write_logfile(
+    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const std::vector<llama_token> & input_tokens, const std::string & output,
+    const std::vector<llama_token> & output_tokens) {
+
+    if (params.logdir.empty()) {
+        return;
+    }
+
+    const std::string timestamp = get_sortable_timestamp();
+
+    const bool success = create_directory_with_parents(params.logdir);
+    if (!success) {
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
+        return;
+    }
+
+    // logdir and timestamp are concatenated as-is: no path separator is inserted here
+    const std::string logfile_path = params.logdir + timestamp + ".yml";
+    FILE * logfile = fopen(logfile_path.c_str(), "w");
+
+    if (logfile == NULL) {
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        return;
+    }
+
+    // NOTE(review): the log labels the binary as "main" although this is the mpi example —
+    // confirm whether downstream tooling depends on this value before changing it
+    fprintf(logfile, "binary: main\n");
+    char model_desc[128];
+    llama_model_desc(model, model_desc, sizeof(model_desc));
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+
+    fprintf(logfile, "\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "# Generation Results #\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "\n");
+
+    dump_string_yaml_multiline(logfile, "output", output.c_str());
+    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+
+    llama_dump_timing_info_yaml(logfile, ctx);
+    fclose(logfile);
+}
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+// SIGINT handler: the first interrupt only raises the is_interacting flag; an interrupt
+// received while the flag is already set cleans up the console, prints timings, writes
+// the logfile and terminates the process with status 130. Other signals are ignored.
+void sigint_handler(int signo) {
+    if (signo != SIGINT) {
+        return;
+    }
+
+    if (!is_interacting) {
+        // not interacting yet: just record the request and return to the interrupted code
+        is_interacting = true;
+        return;
+    }
+
+    // already interacting: shut down from inside the handler
+    console::cleanup();
+    printf("\n");
+    llama_print_timings(*g_ctx);
+    write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+    _exit(130);
+}
+#endif
+
+int main(int argc, char ** argv) {
+
+    gpt_params params;
+    g_params = &params;
+
+    if (argc > 2) {
+        fprintf(stderr, "Must only have one argument, the file to read options from.\n");
+        return 2;
+    }
+
+    std::string rawOptions = argv[0];
+    rawOptions += ' ';
+    std::ifstream optionsFile(argv[1]);
+    if (optionsFile.is_open()) {
+        std::ostringstream buf;
+        buf << optionsFile.rdbuf();
+        rawOptions += buf.str();
+        optionsFile.close();
+
+    } else {
+        fprintf(stderr, "Cannot open options file at path %s\n", argv[1]);
+        return 3;
+    }
+
+    rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1);
+
+    printf("%s", rawOptions.c_str());
+
+    wordexp_t  splitOptions;
+    wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD);
+    //char** loadedArgs = (char **) malloc(1 + sizeof(char*) * splitOptions.we_wordc);
+    //loadedArgs[0] = argv[0];
+    //memcpy(&loadedArgs[1], splitOptions.we_wordv, sizeof(char*) * splitOptions.we_wordc);
+    printf("Loaded argc: %d", splitOptions.we_wordc);
+    for (int i = 0; i < splitOptions.we_wordc; i++) {
+
+        printf(" %s", splitOptions.we_wordv[i]);
+    }
+    printf("\n");
+
+    if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) {
+        wordfree(&splitOptions);
+        return 1;
+    }
+    wordfree(&splitOptions);
+
+    // save choice to use color for later
+    // (note for later: this is a slightly awkward choice)
+    console::init(params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });
+
+    if (params.perplexity) {
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+
+    if (params.embedding) {
+        printf("\n************\n");
+        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+
+    if (params.rope_freq_base != 10000.0) {
+        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 1.0) {
+        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+    }
+
+    if (params.n_ctx > 2048) {
+        // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_backend_init(params.numa);
+
+    llama_model * model;
+    llama_context * ctx;
+    llama_context * ctx_guidance = NULL;
+    g_model = &model;
+    g_ctx = &ctx;
+
+    // load the model and apply lora adapter, if any
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (params.cfg_scale > 1.f) {
+        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
+        ctx_guidance = llama_new_context_with_model(model, lparams);
+    }
+
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
+    // uncomment the "used_mem" line in llama.cpp to see the results
+    if (params.mem_test) {
+        {
+            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
+
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
+        }
+
+        llama_print_timings(ctx);
+        llama_free(ctx);
+        llama_free_model(model);
+
+        return 0;
+    }
+
+    // export the cgraph and exit
+    if (params.export_cgraph) {
+        llama_eval_export(ctx, "llama.ggml");
+        llama_free(ctx);
+        llama_free_model(model);
+
+        return 0;
+    }
+    llama_split_layers_weighted(ctx, params.mpi_layer_split);
+
+    std::string path_session = params.path_prompt_cache;
+    std::vector<llama_token> session_tokens;
+
+    if (!path_session.empty()) {
+        fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+
+        // fopen to check for existing session
+        FILE * fp = std::fopen(path_session.c_str(), "rb");
+        if (fp != NULL) {
+            std::fclose(fp);
+
+            session_tokens.resize(params.n_ctx);
+            size_t n_token_count_out = 0;
+            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                return 1;
+            }
+            session_tokens.resize(n_token_count_out);
+            llama_set_rng_seed(ctx, params.seed);
+
+            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+        } else {
+            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+        }
+    }
+
+    // Add BOS if SPM tokenizer
+    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+
+    // tokenize the prompt
+    std::vector<llama_token> embd_inp;
+
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+    } else {
+        embd_inp = session_tokens;
+    }
+
+    // Should not run without any tokens
+    if (embd_inp.empty()) {
+        embd_inp.push_back(llama_token_bos(ctx));
+    }
+
+    // Tokenize negative prompt
+    std::vector<llama_token> guidance_inp;
+    int guidance_offset = 0;
+    int original_prompt_len = 0;
+    if (ctx_guidance) {
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        original_prompt_len = original_inp.size();
+        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+    }
+
+    const int n_ctx = llama_n_ctx(ctx);
+
+    if ((int) embd_inp.size() > n_ctx - 4) {
+        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        return 1;
+    }
+
+    // debug message about similarity of saved session, if applicable
+    size_t n_matching_session_tokens = 0;
+    if (session_tokens.size()) {
+        for (llama_token id : session_tokens) {
+            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                break;
+            }
+            n_matching_session_tokens++;
+        }
+        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+        } else if (n_matching_session_tokens >= embd_inp.size()) {
+            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        } else {
+            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        }
+    }
+
+    // if we will use the cache for the full prompt without reaching the end of the cache, force
+    // reevaluation of the last token to recalculate the cached logits
+    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
+            session_tokens.size() > embd_inp.size()) {
+        session_tokens.resize(embd_inp.size() - 1);
+    }
+
+    // number of tokens to keep when resetting context
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
+        params.n_keep = (int)embd_inp.size();
+    }
+
+    // prefix & suffix for instruct mode
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);
+
+    // in instruct mode, we inject a prefix and a suffix to each input by the user
+    if (params.instruct) {
+        params.interactive_first = true;
+        params.antiprompt.push_back("### Instruction:\n\n");
+    }
+
+    // enable interactive mode if interactive start is specified
+    if (params.interactive_first) {
+        params.interactive = true;
+    }
+
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+        }
+
+        if (ctx_guidance) {
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+            for (int i = 0; i < (int) guidance_inp.size(); i++) {
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
+            }
+        }
+
+        if (params.n_keep > 0) {
+        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
+            for (int i = 0; i < params.n_keep; i++) {
+                fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            }
+            fprintf(stderr, "'\n");
+        }
+        fprintf(stderr, "\n");
+    }
+
+    if (params.interactive) {
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+
+        if (params.antiprompt.size()) {
+            for (auto antiprompt : params.antiprompt) {
+                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
+            }
+        }
+
+        if (params.input_prefix_bos) {
+            fprintf(stderr, "Input prefix with BOS\n");
+        }
+
+        if (!params.input_prefix.empty()) {
+            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
+        }
+
+        if (!params.input_suffix.empty()) {
+            fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
+        }
+    }
+    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    fprintf(stderr, "\n\n");
+
+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar *             grammar = NULL;
+    if (!params.grammar.empty()) {
+        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+        // will be empty (default) if there are parse errors
+        if (parsed_grammar.rules.empty()) {
+            return 1;
+        }
+        fprintf(stderr, "%s: grammar:\n", __func__);
+        grammar_parser::print_grammar(stderr, parsed_grammar);
+        fprintf(stderr, "\n");
+
+        {
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
+            if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+            }
+        }
+
+        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+        grammar = llama_grammar_init(
+            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    }
+
+    // TODO: replace with ring-buffer
+    std::vector<llama_token> last_n_tokens(n_ctx);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+
+    if (params.interactive) {
+        const char *control_message;
+        if (params.multiline_input) {
+            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n";
+        } else {
+            control_message = " - Press Return to return control to LLaMa.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n"
+                              " - If you want to submit another line, end your input with '\\'.\n";
+        }
+        fprintf(stderr, "== Running in interactive mode. ==\n"
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+               " - Press Ctrl+C to interject at any time.\n"
+#endif
+               "%s\n", control_message);
+
+        is_interacting = params.interactive_first;
+    }
+
+    bool is_antiprompt        = false;
+    bool input_echo           = true;
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
+
+    int n_past             = 0;
+    int n_remain           = params.n_predict;
+    int n_consumed         = 0;
+    int n_session_consumed = 0;
+    int n_past_guidance    = 0;
+
+    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
+    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
+    std::ostringstream output_ss;     g_output_ss     = &output_ss;
+
+    // the first thing we will do is to output the prompt, so set color accordingly
+    console::set_display(console::prompt);
+
+    std::vector<llama_token> embd;
+    std::vector<llama_token> embd_guidance;
+
+    // do one empty run to warm up the model
+    {
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
+        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(ctx);
+    }
+
+    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
+        // predict
+        if (embd.size() > 0) {
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            auto max_embd_size = n_ctx - 4;
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int)embd.size() > max_embd_size) {
+                auto skipped_tokens = embd.size() - max_embd_size;
+                console::set_display(console::error);
+                printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                console::set_display(console::reset);
+                fflush(stdout);
+                embd.resize(max_embd_size);
+            }
+
+            // infinite text generation via context swapping
+            // if we run out of context:
+            // - take the n_keep first tokens from the original prompt (via n_past)
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
+            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+                if (params.n_predict == -2) {
+                    fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
+                    break;
+                }
+
+                const int n_left = n_past - params.n_keep;
+                // always keep the first token - BOS
+                n_past = std::max(1, params.n_keep);
+                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
+
+                // insert n_left/2 tokens at the start of embd from last_n_tokens
+                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
+
+                // stop saving session if we run out of context
+                path_session.clear();
+
+                //printf("\n---\n");
+                //printf("resetting: '");
+                //for (int i = 0; i < (int) embd.size(); i++) {
+                //    printf("%s", llama_token_to_piece(ctx, embd[i]));
+                //}
+                //printf("'\n");
+                //printf("\n---\n");
+            }
+
+            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+            if (n_session_consumed < (int) session_tokens.size()) {
+                size_t i = 0;
+                for ( ; i < embd.size(); i++) {
+                    if (embd[i] != session_tokens[n_session_consumed]) {
+                        session_tokens.resize(n_session_consumed);
+                        break;
+                    }
+
+                    n_past++;
+                    n_session_consumed++;
+
+                    if (n_session_consumed >= (int) session_tokens.size()) {
+                        ++i;
+                        break;
+                    }
+                }
+                if (i > 0) {
+                    embd.erase(embd.begin(), embd.begin() + i);
+                }
+            }
+
+            // evaluate tokens in batches
+            // embd is typically prepared beforehand to fit within a batch, but not always
+
+            if (ctx_guidance) {
+                int input_size = 0;
+                llama_token* input_buf = NULL;
+
+                if (n_past_guidance < (int) guidance_inp.size()) {
+                    // Guidance context should have the same data with these modifications:
+                    //
+                    // * Replace the initial prompt
+                    // * Shift everything by guidance_offset
+                    embd_guidance = guidance_inp;
+                    if (embd.begin() + original_prompt_len < embd.end()) {
+                        embd_guidance.insert(
+                            embd_guidance.end(),
+                            embd.begin() + original_prompt_len,
+                            embd.end()
+                        );
+                    }
+
+                    input_buf = embd_guidance.data();
+                    input_size = embd_guidance.size();
+                    //fprintf(stderr, "\n---------------------\n");
+                    //for (int i = 0; i < (int) embd_guidance.size(); i++) {
+                        //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
+                    //}
+                    //fprintf(stderr, "\n---------------------\n");
+                } else {
+                    input_buf = embd.data();
+                    input_size = embd.size();
+                }
+
+                for (int i = 0; i < input_size; i += params.n_batch) {
+                    int n_eval = std::min(input_size - i, params.n_batch);
+                    if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
+                        fprintf(stderr, "%s : failed to eval\n", __func__);
+                        return 1;
+                    }
+
+                    n_past_guidance += n_eval;
+                }
+            }
+
+            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
+                int n_eval = (int) embd.size() - i;
+                if (n_eval > params.n_batch) {
+                    n_eval = params.n_batch;
+                }
+                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+                    fprintf(stderr, "%s : failed to eval\n", __func__);
+                    return 1;
+                }
+                n_past += n_eval;
+            }
+
+            if (embd.size() > 0 && !path_session.empty()) {
+                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+                n_session_consumed = session_tokens.size();
+            }
+        }
+
+        embd.clear();
+        embd_guidance.clear();
+
+        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
+            // out of user input, sample next token
+            const float   temp            = params.temp;
+            const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            const float   top_p           = params.top_p;
+            const float   tfs_z           = params.tfs_z;
+            const float   typical_p       = params.typical_p;
+            const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+            const float   repeat_penalty  = params.repeat_penalty;
+            const float   alpha_presence  = params.presence_penalty;
+            const float   alpha_frequency = params.frequency_penalty;
+            const int     mirostat        = params.mirostat;
+            const float   mirostat_tau    = params.mirostat_tau;
+            const float   mirostat_eta    = params.mirostat_eta;
+            const bool    penalize_nl     = params.penalize_nl;
+
+            // optionally save the session on first sample (for faster prompt loading next time)
+            if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
+                need_to_save_session = false;
+                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+            }
+
+            llama_token id = 0;
+
+            {
+                auto logits  = llama_get_logits(ctx);
+                auto n_vocab = llama_n_vocab(ctx);
+
+                // Apply params.logit_bias map
+                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+                    logits[it->first] += it->second;
+                }
+
+                std::vector<llama_token_data> candidates;
+                candidates.reserve(n_vocab);
+                for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                    candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+                }
+
+                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+                if (ctx_guidance) {
+                    llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
+                }
+
+                // Apply penalties
+                float nl_logit = logits[llama_token_nl(ctx)];
+                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+                llama_sample_repetition_penalty(ctx, &candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, repeat_penalty);
+                llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, alpha_frequency, alpha_presence);
+                if (!penalize_nl) {
+                    for (size_t idx = 0; idx < candidates_p.size; idx++) {
+                        if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
+                            candidates_p.data[idx].logit = nl_logit;
+                            break;
+                        }
+                    }
+                }
+
+                if (grammar != NULL) {
+                    llama_sample_grammar(ctx, &candidates_p, grammar);
+                }
+
+                if (temp <= 0) {
+                    // Greedy sampling
+                    id = llama_sample_token_greedy(ctx, &candidates_p);
+                } else {
+                    if (mirostat == 1) {
+                        static float mirostat_mu = 2.0f * mirostat_tau;
+                        const int mirostat_m = 100;
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+                    } else if (mirostat == 2) {
+                        static float mirostat_mu = 2.0f * mirostat_tau;
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+                    } else {
+                        // Temperature sampling
+                        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                        llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                        llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token(ctx, &candidates_p);
+                    }
+                }
+                // printf("`%d`", candidates_p.size);
+
+                if (grammar != NULL) {
+                    llama_grammar_accept_token(ctx, grammar, id);
+                }
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+            }
+
+            // add it to the context
+            embd.push_back(id);
+
+            // echo this to console
+            input_echo = true;
+
+            // decrement remaining sampling budget
+            --n_remain;
+        } else {
+            // some user input remains from prompt or interaction, forward it to processing
+            while ((int) embd_inp.size() > n_consumed) {
+                embd.push_back(embd_inp[n_consumed]);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(embd_inp[n_consumed]);
+                ++n_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+
+        // display text
+        if (input_echo) {
+            for (auto id : embd) {
+                const std::string token_str = llama_token_to_piece(ctx, id);
+                printf("%s", token_str.c_str());
+
+                if (embd.size() > 1) {
+                    input_tokens.push_back(id);
+                } else {
+                    output_tokens.push_back(id);
+                    output_ss << token_str;
+                }
+            }
+            fflush(stdout);
+        }
+        // reset color to default if there is no pending user input
+        if (input_echo && (int)embd_inp.size() == n_consumed) {
+            console::set_display(console::reset);
+        }
+
+        // if not currently processing queued inputs;
+        if ((int) embd_inp.size() <= n_consumed) {
+
+            // check for reverse prompt
+            if (params.antiprompt.size()) {
+                std::string last_output;
+                for (auto id : last_n_tokens) {
+                    last_output += llama_token_to_piece(ctx, id);
+                }
+
+                is_antiprompt = false;
+                // Check if each of the reverse prompts appears at the end of the output.
+                // If we're not running interactively, the reverse prompt might be tokenized with some following characters
+                // so we'll compensate for that by widening the search window a bit.
+                for (std::string & antiprompt : params.antiprompt) {
+                    size_t extra_padding = params.interactive ? 0 : 2;
+                    size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
+                        ? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
+                        : 0;
+
+                    if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
+                        if (params.interactive) {
+                            is_interacting = true;
+                            console::set_display(console::user_input);
+                        }
+                        is_antiprompt = true;
+                        fflush(stdout);
+                        break;
+                    }
+                }
+            }
+
+            // deal with end of text token in interactive mode
+            if (last_n_tokens.back() == llama_token_eos(ctx)) {
+                if (params.interactive) {
+                    if (params.antiprompt.size() != 0) {
+                        // tokenize and inject first reverse prompt
+                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                        embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                        is_antiprompt = true;
+                    }
+
+                    is_interacting = true;
+                    printf("\n");
+                    console::set_display(console::user_input);
+                    fflush(stdout);
+                } else if (params.instruct) {
+                    is_interacting = true;
+                }
+            }
+
+            if (n_past > 0 && is_interacting) {
+                if (params.instruct) {
+                    printf("\n> ");
+                }
+
+                if (params.input_prefix_bos) {
+                    embd_inp.push_back(llama_token_bos(ctx));
+                }
+
+                std::string buffer;
+                if (!params.input_prefix.empty()) {
+                    buffer += params.input_prefix;
+                    printf("%s", buffer.c_str());
+                }
+
+                std::string line;
+                bool another_line = true;
+                do {
+                    another_line = console::readline(line, params.multiline_input);
+                    buffer += line;
+                } while (another_line);
+
+                // done taking input, reset color
+                console::set_display(console::reset);
+
+                // Add tokens to embd only if the input buffer is non-empty
+                // Entering an empty line lets the user pass control back
+                if (buffer.length() > 1) {
+                    // append input suffix if any
+                    if (!params.input_suffix.empty()) {
+                        buffer += params.input_suffix;
+                        printf("%s", params.input_suffix.c_str());
+                    }
+
+                    const size_t original_size = embd_inp.size();
+
+                    // instruct mode: insert instruction prefix
+                    if (params.instruct && !is_antiprompt) {
+                        n_consumed = embd_inp.size();
+                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+                    }
+
+                    auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+
+                    // instruct mode: insert response suffix
+                    if (params.instruct) {
+                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                    }
+
+                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
+                        const llama_token token = embd_inp[i];
+                        output_tokens.push_back(token);
+                        output_ss << llama_token_to_piece(ctx, token);
+                    }
+
+                    n_remain -= line_inp.size();
+                }
+
+                input_echo = false; // do not echo this again
+            }
+
+            if (n_past > 0) {
+                if (is_interacting) {
+                    // reset grammar state if we're restarting generation
+                    if (grammar != NULL) {
+                        llama_grammar_free(grammar);
+
+                        std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
+                        grammar = llama_grammar_init(
+                            grammar_rules.data(), grammar_rules.size(),
+                            parsed_grammar.symbol_ids.at("root"));
+                    }
+                }
+                is_interacting = false;
+            }
+        }
+
+        // end of text token
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
+            fprintf(stderr, " [end of text]\n");
+            break;
+        }
+
+        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
+        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
+        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
+            n_remain = params.n_predict;
+            is_interacting = true;
+        }
+    }
+
+    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
+        fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+    }
+
+    llama_print_timings(ctx);
+    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+
+    if (ctx_guidance) { llama_free(ctx_guidance); }
+    llama_free(ctx);
+    llama_free_model(model);
+
+    if (grammar != NULL) {
+        llama_grammar_free(grammar);
+    }
+    llama_backend_free();
+
+    return 0;
+}
--- a/ggml-mpi.c
+++ b/ggml-mpi.c
@ -14,10 +14,14 @@
 struct ggml_mpi_context {
    int rank;
    int size;
+    MPI_Comm comm;    // communicator that rank/size refer to and that all MPI calls on this context use
+    int layer_start;  // first layer assigned to this rank; written by ggml_mpi_scatter_layers
+    int layer_end;    // end of this rank's layer range; written by ggml_mpi_scatter_layers
 };

 void ggml_mpi_backend_init(void) {
-    MPI_Init(NULL, NULL);
+    // Initialize MPI requesting MPI_THREAD_MULTIPLE. `ret` receives the
+    // thread-support level the MPI library actually provides.
+    // NOTE(review): `ret` is never inspected, so a library that provides a
+    // lower level than requested goes unnoticed - confirm this is acceptable.
+    int ret;
+    MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &ret);
 }

 void ggml_mpi_backend_free(void) {
@ -29,10 +33,19 @@ struct ggml_mpi_context * ggml_mpi_init(void) {

    MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
+    ctx->comm = MPI_COMM_WORLD;

    return ctx;
 }

+// Builds a new heap-allocated context on a sub-communicator of ctx->comm.
+//
+// The sub-communicator is obtained with MPI_Comm_split(ctx->comm, color, key);
+// the new context's rank and size are queried from it. The layer range fields
+// are left zeroed by calloc until ggml_mpi_scatter_layers fills them in.
+// NOTE(review): the calloc result and the resulting communicator are used
+// unchecked - confirm callers never pass a color that yields a null communicator.
+struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) {
+    struct ggml_mpi_context * split_ctx = calloc(1, sizeof(*split_ctx));
+
+    MPI_Comm_split(ctx->comm, color, key, &split_ctx->comm);
+
+    // both queries only depend on the freshly created communicator
+    MPI_Comm_size(split_ctx->comm, &split_ctx->size);
+    MPI_Comm_rank(split_ctx->comm, &split_ctx->rank);
+
+    return split_ctx;
+}
+
 void ggml_mpi_free(struct ggml_mpi_context * ctx) {
    free(ctx);
 }
@ -41,19 +54,21 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
    return ctx->rank;
 }

+// Returns the number of ranks recorded in the context (ctx->size), i.e. the
+// size of the communicator the context was created on.
+int ggml_mpi_size(struct ggml_mpi_context * ctx) {
+    return ctx->size;
+}
+
 void ggml_mpi_eval_init(
        struct ggml_mpi_context * ctx_mpi,
                            int * n_tokens,
                            int * n_past,
                            int * n_threads) {
-    UNUSED(ctx_mpi);

-    // synchronize the worker node parameters with the root node
-    MPI_Barrier(MPI_COMM_WORLD);

-    MPI_Bcast(n_tokens,  1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_past,    1, MPI_INT, 0, MPI_COMM_WORLD);
-    MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    // n_threads is no longer broadcast from rank 0, so every rank keeps the
+    // value it was called with; mark it used to avoid an unused-parameter warning
+    UNUSED(n_threads);
+
+    // synchronize the worker node parameters (n_tokens, n_past) with rank 0
+    // of the context's communicator
+    MPI_Barrier(ctx_mpi->comm);
+
+    MPI_Bcast(n_tokens,  1, MPI_INT, 0, ctx_mpi->comm);
+    MPI_Bcast(n_past,    1, MPI_INT, 0, ctx_mpi->comm);
 }

 static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
@ -73,7 +88,8 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
    return -1;
 }

-static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
+
+static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_Comm comm) {
    MPI_Datatype mpi_type;

    switch (t->type) {
@ -82,11 +98,11 @@ static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
        default: GGML_ASSERT(false && "not implemented");
    }

-    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
+    const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, comm);
    GGML_ASSERT(retval == MPI_SUCCESS);
 }

-static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
+static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_Comm comm) {
    MPI_Datatype mpi_type;

    switch (t->type) {
@ -97,10 +113,72 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {

    MPI_Status status; UNUSED(status);

-    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
+    const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status);
    GGML_ASSERT(retval == MPI_SUCCESS);
 }

+uint16_t** ggml_mpi_split_range(
+    struct ggml_mpi_context * ctx_mpi,
+    uint16_t start,
+    uint16_t end,
+    float node_weights[]
+) {
+    // Splits the range given by start and end (both inclusive)
+    // over the available nodes. This implementation
+    // assumes that node 0 handles the final part of the range
+    // while node 1 handles the beginning, to form a ring pipeline
+    //
+    // node_weights is indexed by rank and must hold ctx_mpi->size entries;
+    // each entry is the fraction of the range given to that node.
+    //
+    // Returns an array of ctx_mpi->size two-element arrays
+    // ([0] = range start, [1] = range end), all allocated with malloc
+    // and owned by the caller.
+
+    // Only node 0 deals with the device splits, other nodes
+    // get the splits from the scatter layers operation
+
+    if (ctx_mpi->rank != 0) {
+        return NULL;
+    }
+
+    const uint16_t range_length = end - start + 1;
+
+    uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size);
+    GGML_ASSERT(ranges != NULL);
+    for (int i = 0; i < ctx_mpi->size; i++) {
+        ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2);
+        GGML_ASSERT(ranges[i] != NULL);
+    }
+
+    // The first range begins at `start` (not at 0), and each range end is
+    // derived from its own beginning, so a non-zero `start` is applied
+    // exactly once instead of being re-added for every node.
+    uint16_t next_layer = start;
+    for (int i = 1; i < ctx_mpi->size; i++) {
+        ranges[i][0] = next_layer;
+        // fractional layer counts are truncated when stored as uint16_t
+        ranges[i][1] = (uint16_t) MIN(end, next_layer + (node_weights[i] * range_length));
+        next_layer = ranges[i][1];
+    }
+
+    // node 0 takes whatever follows the last worker node
+    ranges[0][0] = next_layer;
+    ranges[0][1] = (uint16_t) MIN(end, next_layer + (node_weights[0] * range_length));
+    return ranges;
+
+}
+
+void ggml_mpi_scatter_layers(
+    struct ggml_mpi_context * ctx_mpi,
+    uint16_t ** layer_ranges
+) {
+    // Sends one [start, end] layer pair from rank 0 to every rank of
+    // ctx_mpi->comm and stores the pair received by this rank in
+    // ctx_mpi->layer_start / ctx_mpi->layer_end.
+    //
+    // Layer ranges is a 2d array with the first dimension
+    // having a length of the number of nodes and the second
+    // dimension having a length of 2. The inner arrays contain
+    // the start and end layer ID for a node.
+    // Only rank 0's array is the source of the scatter; it is not freed here.
+
+    // rank 0 is the scatter root, so it must actually provide the ranges;
+    // otherwise uninitialized stack data would be distributed
+    GGML_ASSERT(ctx_mpi->rank != 0 || layer_ranges != NULL);
+
+    // MPI_Scatter needs one contiguous buffer:
+    // [start_0, end_0, start_1, end_1, ...]
+    uint16_t flattened_ranges[ctx_mpi->size * 2];
+
+    if (layer_ranges != NULL) {
+        for (int node = 0; node < ctx_mpi->size; node++) {
+            flattened_ranges[2*node + 0] = layer_ranges[node][0];
+            flattened_ranges[2*node + 1] = layer_ranges[node][1];
+        }
+    }
+
+    uint16_t received_range[2];
+    const int retval = MPI_Scatter(flattened_ranges, 2, MPI_UINT16_T, received_range, 2, MPI_UINT16_T, 0, ctx_mpi->comm);
+    GGML_ASSERT(retval == MPI_SUCCESS);
+
+    ctx_mpi->layer_start = received_range[0];
+    ctx_mpi->layer_end   = received_range[1];
+    fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end);
+}
+
 // TODO: there are many improvements that can be done to this implementation
 void ggml_mpi_graph_compute_pre(
        struct ggml_mpi_context * ctx_mpi,
@ -134,29 +212,36 @@ void ggml_mpi_graph_compute_pre(
    // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
    // node 0:   [(n-1) * n_per_node,            n_nodes)
    //
+
+
+
    if (mpi_rank > 0) {
        if (mpi_rank == 1) {
            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0);
+            ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm);
        } else {
            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
+            ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm);
        }
    } else if (mpi_size > 1) {
        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1);
+        ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm);

        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1);
+        ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm);
    }

    {
+
+
        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;

        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;

-        const int il0 =               (mpi_idx + 0) * n_per_node;
-        const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
+        //const int il0 =               (mpi_idx + 0) * n_per_node;
+        //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
+        int il0 = ctx_mpi->layer_start;
+        int il1 = MIN(n_layers, ctx_mpi->layer_end);

        char name_l0[GGML_MAX_NAME];
        char name_l1[GGML_MAX_NAME];
@ -196,7 +281,6 @@ void ggml_mpi_graph_compute_pre(

        gf->n_nodes = idx_l1 - idx_l0;

-        //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
    }
 }

@ -211,6 +295,6 @@ void ggml_mpi_graph_compute_post(

    // send the output data to the next node
    if (mpi_rank > 0) {
-        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
+        ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm);
    }
 }
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@ -1,4 +1,5 @@
 #pragma once
+#include <stdint.h>

 struct ggml_context;
 struct ggml_tensor;
@ -14,15 +15,27 @@ void ggml_mpi_backend_init(void);
 void ggml_mpi_backend_free(void);

 struct ggml_mpi_context * ggml_mpi_init(void);
+struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key);
 void ggml_mpi_free(struct ggml_mpi_context * ctx);

 int ggml_mpi_rank(struct ggml_mpi_context * ctx);
-
+int ggml_mpi_size(struct ggml_mpi_context * ctx);
 void ggml_mpi_eval_init(
        struct ggml_mpi_context * ctx_mpi,
                            int * n_tokens,
                            int * n_past,
                            int * n_threads);
+// Computes a [start, end] layer pair for every rank from per-rank weights.
+// On rank 0 returns a malloc'd array with one two-element array per rank
+// (node_weights is indexed by rank); on every other rank returns NULL.
+uint16_t** ggml_mpi_split_range(
+    struct ggml_mpi_context * ctx_mpi,
+    uint16_t start,
+    uint16_t end,
+    float node_weights[]
+);
+
+// Scatters the per-rank layer pairs from rank 0 over the context's
+// communicator and records the pair received by the calling rank in the
+// context. layer_ranges may be NULL, as it is on ranks where
+// ggml_mpi_split_range returned NULL.
+void ggml_mpi_scatter_layers(
+    struct ggml_mpi_context * ctx_mpi,
+    uint16_t ** layer_ranges
+);

 void ggml_mpi_graph_compute_pre(
        struct ggml_mpi_context * ctx_mpi,
--- a/llama.cpp
+++ b/llama.cpp
@ -1098,6 +1098,10 @@ struct llama_mmap {
        int flags = MAP_SHARED;
        // prefetch/readahead impairs performance on NUMA systems
        if (numa)  { prefetch = 0; }
+
+#ifdef GGML_USE_MPI
+        prefetch = 0;
+#endif
 #ifdef __linux__
        // advise the kernel to read the file sequentially (increases readahead)
        if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@ -1106,6 +1110,7 @@ struct llama_mmap {
        }
        if (prefetch) { flags |= MAP_POPULATE; }
 #endif
+
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) { // NOLINT
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
@ -12697,9 +12702,7 @@ void llama_backend_init(void) {
        ggml_free(ctx);
    }

-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
+
 }

 void llama_numa_init(enum ggml_numa_strategy numa) {
@ -13075,20 +13078,21 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_MPI
    ctx->ctx_mpi = ggml_mpi_init();

-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
 #endif

    return ctx;
 }

+// Assigns a range of model layers to every MPI rank of ctx.
+//
+// device_weights holds one fraction per rank. Rank 0 checks that the number of
+// weights matches the number of ranks, turns the weights into layer ranges
+// over [0, n_layer - 1] and the ranges are then scattered to all ranks.
+// Without GGML_USE_MPI this function does nothing.
+void llama_split_layers_weighted(struct llama_context * ctx, std::vector<float> device_weights) {
+#ifdef GGML_USE_MPI
+    // compare as size_t to avoid a signed/unsigned comparison
+    const size_t n_ranks = static_cast<size_t>(ggml_mpi_size(ctx->ctx_mpi));
+    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && n_ranks != device_weights.size()) {
+        GGML_ASSERT(false && "Must have same number of split percentages as devices");
+    }
+
+    // only rank 0 gets a non-NULL result; the other ranks receive their range via the scatter
+    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights.data());
+    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
+    // NOTE(review): the arrays malloc'd by ggml_mpi_split_range on rank 0 are
+    // not released here nor by ggml_mpi_scatter_layers - they leak per call.
+#else
+    // nothing to split without MPI; silence unused-parameter warnings
+    (void) ctx;
+    (void) device_weights;
+#endif
+}
+
 void llama_free(struct llama_context * ctx) {
    delete ctx;
 }
--- a/llama.h
+++ b/llama.h
@ -8,7 +8,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdbool.h>
-
+#include <vector>
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
@ -358,6 +358,8 @@ extern "C" {
                             const char * path_model,
            struct llama_model_params     params);

+    // Splits the model layers across MPI ranks using one weight per rank
+    // (the implementation is empty unless GGML_USE_MPI is defined).
+    // NOTE(review): this takes a std::vector by value inside the extern "C"
+    // section, which presumably makes the header unusable from plain C - confirm intended.
+    LLAMA_API void llama_split_layers_weighted(struct llama_context * ctx, std::vector<float> device_weights);
+
    LLAMA_API void llama_free_model(struct llama_model * model);

    LLAMA_API struct llama_context * llama_new_context_with_model(