From 3ca1ca01827b2f43546420a8da3a3d616eb00f6e Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 24 Sep 2023 10:05:34 -0500 Subject: [PATCH 01/35] Refactor MPI for heterogeneous cluster support. Adds support for different options and numbers of layers per node. The per-node options are implemented by parsing command-line options from a file instead of from the command line itself. This allows each node to have its own version of this options file. The differing number of layers per node is implemented as a new option, `mpi-layer-split`, that takes a list of percentages. These percentages are used to calculate the range of layers to delegate to each node. The ranges are calculated on the head node and then scattered to the other nodes to maintain a single source of truth. --- common/common.cpp | 17 + common/common.h | 1 + examples/mpi/CMakeLists.txt | 8 + examples/mpi/README.md | 80 +++ examples/mpi/mpi.cpp | 945 ++++++++++++++++++++++++++++++++++++ ggml-mpi.c | 122 ++++- ggml-mpi.h | 15 +- llama.cpp | 28 +- llama.h | 4 +- 9 files changed, 1187 insertions(+), 33 deletions(-) create mode 100644 examples/mpi/CMakeLists.txt create mode 100644 examples/mpi/README.md create mode 100644 examples/mpi/mpi.cpp diff --git a/common/common.cpp b/common/common.cpp index 58fbd05aa..0d9b19cbe 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -663,6 +663,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL + } else if (arg == "--mpi-layer-split") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector<std::string> split_arg{it, {}}; + params.mpi_layer_split.resize(split_arg.size()); + for (size_t node = 0; node < split_arg.size(); ++node) { + params.mpi_layer_split[node] = std::stof(split_arg[node]); + } + + } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; diff --git a/common/common.h b/common/common.h index d250eef8b..f3b913d9d 100644 --- a/common/common.h +++ b/common/common.h @@ -62,6 +62,7 @@ struct gpt_params { int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + std::vector<float> mpi_layer_split = {1.0}; // list of percentages of the total number of layers int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width.
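The arithmetic behind `mpi-layer-split` is easier to see in isolation. Below is a minimal, hypothetical sketch (not code from this patch) of how a weight list such as `0.8,0.2` could map onto contiguous layer ranges; it assumes a 32-layer model and mirrors the ring ordering used by `ggml_mpi_split_range()` later in this series (node 1 takes the first slice, node 0 the last), with rounding simplified relative to the real implementation.

```cpp
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    const int n_layers = 32;                         // assumed model depth
    const std::vector<float> weights = {0.8f, 0.2f}; // index 0 is the head node (rank 0)

    std::vector<std::pair<int, int>> ranges(weights.size());
    int next_layer = 0;

    // ranks 1..n-1 take the leading slices of the layer range, in order
    for (size_t i = 1; i < weights.size(); ++i) {
        ranges[i].first  = next_layer;
        ranges[i].second = std::min(n_layers, next_layer + (int)(weights[i] * n_layers));
        next_layer = ranges[i].second;
    }
    // the head node closes the ring by taking the final slice
    ranges[0] = {next_layer, std::min(n_layers, next_layer + (int)(weights[0] * n_layers))};

    for (size_t i = 0; i < ranges.size(); ++i) {
        std::printf("node %zu: layers [%d, %d)\n", i, ranges[i].first, ranges[i].second);
    }
    return 0;
}
```

With the weights above, the head node ends up with roughly 80% of the layers, matching the `--mpi-layer-split 0.8,0.2` example used in the README below.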
diff --git a/examples/mpi/CMakeLists.txt b/examples/mpi/CMakeLists.txt new file mode 100644 index 000000000..07d83b61d --- /dev/null +++ b/examples/mpi/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET mpi) +add_executable(${TARGET} mpi.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/mpi/README.md b/examples/mpi/README.md new file mode 100644 index 000000000..44a047915 --- /dev/null +++ b/examples/mpi/README.md @@ -0,0 +1,80 @@ +# llama.cpp/example/mpi + +This example program allows you to use various LLaMA language models in an easy and efficient way across an MPI cluster. +It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. + +## Table of Contents + +1. [Quick Start](#quick-start) +2. [Common Options](#common-options) + +## Quick Start + +To get started right away, write the following to a file on each node, making sure to use the correct path for the model you have: +```bash +--mpi-layer-split 0.8,0.2 -t 4 -m ~/llm-local/codellama-7b.Q3_K_M.gguf --color -c 512 --temp 0.0 --repeat_penalty 1.0 -n 128 -p "double fast_inverse_square_root(double x" +``` + +Each node may have different options; currently, every node must pass the same number of arguments to the `--mpi-layer-split` option and the same +model path, but these will eventually be synchronized from the head node. + +Next, write the hostsfile on the head node, making sure there is only one slot per node. + +Finally, run the following command on the head node to start the program across the cluster: + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +mpirun -hostfile hostsfile -mca orte_keep_fqdn_hostnames t --bind-to none ./mpi options.txt +``` + +Where `hostsfile` is the file containing the cluster hostname configuration and `options.txt` is the path +where each node can find its own options. Storing the model on a network filesystem has not yet been +tested or optimized. + +#### Windows: +Not currently supported. + +For an interactive experience, try this command: + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \ +'User: Hi +AI: Hello. I am an AI chatbot. Would you like to talk? +User: Sure! +AI: What would you like to talk about? +User:' +``` + +#### Windows: + +```powershell +main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot.
Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" +``` + +The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): + +#### Unix-based systems (Linux, macOS, etc.): + +```bash +./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt +``` + +#### Windows: + +```powershell +main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt +``` + +## Common Options + +In this section, we cover the most commonly used options for running the `mpi` program with the LLaMA models: + +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. +- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. +- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `--mpi-layer-split`: Set the percentage of layers to distribute to each node. Must have the same number of arguments as the number of nodes in the cluster. Only the layer split percentages passed to the head node are used; they are scattered to all other nodes in the cluster. diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp new file mode 100644 index 000000000..8d14d8e61 --- /dev/null +++ b/examples/mpi/mpi.cpp @@ -0,0 +1,945 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "common.h" +#include "console.h" +#include "llama.h" +#include "build-info.h" +#include "grammar-parser.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + + +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector<llama_token> * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector<llama_token> * g_output_tokens; +static bool is_interacting = false; + +void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) { + + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + 
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_string_yaml_multiline(logfile, "output", output.c_str()); + dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting = true; + } else { + console::cleanup(); + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + + gpt_params params; + g_params = ¶ms; + + if (argc > 2) { + fprintf(stderr, "Must only have one argument, the file to read options from.\n"); + return 2; + } + + std::string rawOptions = argv[0]; + rawOptions += ' '; + std::ifstream optionsFile(argv[1]); + if (optionsFile.is_open()) { + std::ostringstream buf; + buf << optionsFile.rdbuf(); + rawOptions += buf.str(); + optionsFile.close(); + + } else { + fprintf(stderr, "Cannot open options file at path %s\n", argv[1]); + return 3; + } + + rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1); + + printf("%s", rawOptions.c_str()); + + wordexp_t splitOptions; + wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD); + //char** loadedArgs = (char **) malloc(1 + sizeof(char*) * splitOptions.we_wordc); + //loadedArgs[0] = argv[0]; + //memcpy(&loadedArgs[1], splitOptions.we_wordv, sizeof(char*) * splitOptions.we_wordc); + printf("Loaded argc: %d", splitOptions.we_wordc); + for (int i = 0; i < splitOptions.we_wordc; i++) { + + printf(" %s", splitOptions.we_wordv[i]); + } + printf("\n"); + + if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) { + wordfree(&splitOptions); + return 1; + } + wordfree(&splitOptions); + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.rope_freq_base != 10000.0) { + fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 1.0) { + fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + } + + if (params.n_ctx > 2048) { + // TODO: determine the actual max context of the model (e.g. 
4096 for LLaMA v2) and use that instead of 2048 + fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); + } else if (params.n_ctx < 8) { + fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + llama_context * ctx_guidance = NULL; + g_model = &model; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (params.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); + + const std::vector tmp(params.n_batch, llama_token_bos(ctx)); + llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); + } + + llama_print_timings(ctx); + llama_free(ctx); + llama_free_model(model); + + return 0; + } + + // export the cgraph and exit + if (params.export_cgraph) { + llama_eval_export(ctx, "llama.ggml"); + llama_free(ctx); + llama_free_model(model); + + return 0; + } + llama_split_layers_weighted(ctx, params.mpi_layer_split); + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + + // fopen to check for existing session + FILE * fp = std::fopen(path_session.c_str(), "rb"); + if (fp != NULL) { + std::fclose(fp); + + session_tokens.resize(params.n_ctx); + size_t n_token_count_out = 0; + if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + llama_set_rng_seed(ctx, params.seed); + + fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); + } else { + fprintf(stderr, "%s: session file does not exist, will create\n", __func__); + } + } + + // Add BOS if SPM tokenizer + const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + + // tokenize the prompt + std::vector embd_inp; + + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + } else { + 
embd_inp = session_tokens; + } + + // Should not run without any tokens + if (embd_inp.empty()) { + embd_inp.push_back(llama_token_bos(ctx)); + } + + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + } + + const int n_ctx = llama_n_ctx(ctx); + + if ((int) embd_inp.size() > n_ctx - 4) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (session_tokens.size()) { + for (llama_token id : session_tokens) { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { + break; + } + n_matching_session_tokens++; + } + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { + fprintf(stderr, "%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { + fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); + } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { + fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } else { + fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } + } + + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && + session_tokens.size() > embd_inp.size()) { + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { + params.n_keep = (int)embd_inp.size(); + } + + // prefix & suffix for instruct mode + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + params.interactive_first = true; + params.antiprompt.push_back("### Instruction:\n\n"); + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + 
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } + } + + if (params.n_keep > 0) { + fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + fprintf(stderr, "'\n"); + } + fprintf(stderr, "\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + fprintf(stderr, "%s: interactive mode on.\n", __func__); + + if (params.antiprompt.size()) { + for (auto antiprompt : params.antiprompt) { + fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + } + } + + if (params.input_prefix_bos) { + fprintf(stderr, "Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + } + + if (!params.input_suffix.empty()) { + fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); + } + } + fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); + fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + fprintf(stderr, "\n\n"); + + grammar_parser::parse_state parsed_grammar; + llama_grammar * grammar = NULL; + if (!params.grammar.empty()) { + parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + return 1; + } + fprintf(stderr, "%s: grammar:\n", __func__); + grammar_parser::print_grammar(stderr, parsed_grammar); + fprintf(stderr, "\n"); + + { + auto it = params.logit_bias.find(llama_token_eos(ctx)); + if (it != params.logit_bias.end() && it->second == -INFINITY) { + fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + } + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + } + + // TODO: replace with ring-buffer + std::vector last_n_tokens(n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + if (params.interactive) { + const char *control_message; + if (params.multiline_input) { + control_message = " - To return control to LLaMa, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to LLaMa.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input 
with '\\'.\n"; + } + fprintf(stderr, "== Running in interactive mode. ==\n" +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + " - Press Ctrl+C to interject at any time.\n" +#endif + "%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + int n_past_guidance = 0; + + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(console::prompt); + + std::vector embd; + std::vector embd_guidance; + + // do one empty run to warm up the model + { + const std::vector tmp = { llama_token_bos(ctx), }; + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + llama_reset_timings(ctx); + } + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { + // predict + if (embd.size() > 0) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + auto max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) { + auto skipped_tokens = embd.size() - max_embd_size; + console::set_display(console::error); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + console::set_display(console::reset); + fflush(stdout); + embd.resize(max_embd_size); + } + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (params.n_predict == -2) { + fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__); + break; + } + + const int n_left = n_past - params.n_keep; + // always keep the first token - BOS + n_past = std::max(1, params.n_keep); + n_past_guidance = std::max(1, params.n_keep + guidance_offset); + + // insert n_left/2 tokens at the start of embd from last_n_tokens + embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + + // stop saving session if we run out of context + path_session.clear(); + + //printf("\n---\n"); + //printf("resetting: '"); + //for (int i = 0; i < (int) embd.size(); i++) { + // printf("%s", llama_token_to_piece(ctx, embd[i])); + //} + //printf("'\n"); + //printf("\n---\n"); + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + + if 
(ctx_guidance) { + int input_size = 0; + llama_token* input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + //fprintf(stderr, "\n---------------------\n"); + //for (int i = 0; i < (int) embd_guidance.size(); i++) { + //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); + //} + //fprintf(stderr, "\n---------------------\n"); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + n_past += n_eval; + } + + if (embd.size() > 0 && !path_session.empty()) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + embd_guidance.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + + // optionally save the session on first sample (for faster prompt loading next time) + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { + need_to_save_session = false; + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_token id = 0; + + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale); + } + + // Apply penalties + float nl_logit = logits[llama_token_nl(ctx)]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + for (size_t idx = 0; idx < candidates_p.size; idx++) { + if (candidates_p.data[idx].id == llama_token_nl(ctx)) { + candidates_p.data[idx].logit = nl_logit; + break; + } + } + } + + if (grammar != NULL) { + llama_sample_grammar(ctx, &candidates_p, grammar); + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + // printf("`%d`", candidates_p.size); + + if (grammar != NULL) { + llama_grammar_accept_token(ctx, grammar, id); + } + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } + + // add it to the context + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + } else { + // some user input remains from prompt or 
interaction, forward it to processing + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[n_consumed]); + ++n_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (input_echo) { + for (auto id : embd) { + const std::string token_str = llama_token_to_piece(ctx, id); + printf("%s", token_str.c_str()); + + if (embd.size() > 1) { + input_tokens.push_back(id); + } else { + output_tokens.push_back(id); + output_ss << token_str; + } + } + fflush(stdout); + } + // reset color to default if we there is no pending user input + if (input_echo && (int)embd_inp.size() == n_consumed) { + console::set_display(console::reset); + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + + // check for reverse prompt + if (params.antiprompt.size()) { + std::string last_output; + for (auto id : last_n_tokens) { + last_output += llama_token_to_piece(ctx, id); + } + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string & antiprompt : params.antiprompt) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { + if (params.interactive) { + is_interacting = true; + console::set_display(console::user_input); + } + is_antiprompt = true; + fflush(stdout); + break; + } + } + } + + // deal with end of text token in interactive mode + if (last_n_tokens.back() == llama_token_eos(ctx)) { + if (params.interactive) { + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + is_interacting = true; + printf("\n"); + console::set_display(console::user_input); + fflush(stdout); + } else if (params.instruct) { + is_interacting = true; + } + } + + if (n_past > 0 && is_interacting) { + if (params.instruct) { + printf("\n> "); + } + + if (params.input_prefix_bos) { + embd_inp.push_back(llama_token_bos(ctx)); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + buffer += params.input_prefix; + printf("%s", buffer.c_str()); + } + + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(console::reset); + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { + // append input suffix if any + if (!params.input_suffix.empty()) { + buffer += params.input_suffix; + printf("%s", params.input_suffix.c_str()); + } + + const size_t original_size = embd_inp.size(); + + // instruct mode: insert instruction prefix + if (params.instruct && !is_antiprompt) { + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), 
inp_pfx.begin(), inp_pfx.end()); + } + + auto line_inp = ::llama_tokenize(ctx, buffer, false); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + // instruct mode: insert response suffix + if (params.instruct) { + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + + n_remain -= line_inp.size(); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) { + if (is_interacting) { + // reset grammar state if we're restarting generation + if (grammar != NULL) { + llama_grammar_free(grammar); + + std::vector grammar_rules( parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), + parsed_grammar.symbol_ids.at("root")); + } + } + is_interacting = false; + } + } + + // end of text token + if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { + fprintf(stderr, " [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). + if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + + if (ctx_guidance) { llama_free(ctx_guidance); } + llama_free(ctx); + llama_free_model(model); + + if (grammar != NULL) { + llama_grammar_free(grammar); + } + llama_backend_free(); + + return 0; +} diff --git a/ggml-mpi.c b/ggml-mpi.c index ae176d707..a90897829 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -14,10 +14,14 @@ struct ggml_mpi_context { int rank; int size; + MPI_Comm comm; + int layer_start; + int layer_end; }; void ggml_mpi_backend_init(void) { - MPI_Init(NULL, NULL); + int ret; + MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &ret); } void ggml_mpi_backend_free(void) { @@ -29,10 +33,19 @@ struct ggml_mpi_context * ggml_mpi_init(void) { MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); + ctx->comm = MPI_COMM_WORLD; return ctx; } +struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) { + struct ggml_mpi_context * newCtx = calloc(1, sizeof(struct ggml_mpi_context)); + MPI_Comm_split(ctx->comm, color, key, &newCtx->comm); + MPI_Comm_rank(newCtx->comm, &newCtx->rank); + MPI_Comm_size(newCtx->comm, &newCtx->size); + return newCtx; +} + void ggml_mpi_free(struct ggml_mpi_context * ctx) { free(ctx); } @@ -41,19 +54,21 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx) { return ctx->rank; } +int ggml_mpi_size(struct ggml_mpi_context * ctx) { + return ctx->size; +} + void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int * n_tokens, int * n_past, int * n_threads) { - UNUSED(ctx_mpi); - // synchronize the worker node parameters with the root node - MPI_Barrier(MPI_COMM_WORLD); - MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(n_past, 1, 
MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(ctx_mpi->comm); + + MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); + MPI_Bcast(n_past, 1, MPI_INT, 0, ctx_mpi->comm); } static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { @@ -73,7 +88,8 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { return -1; } -static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) { + +static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_Comm comm) { MPI_Datatype mpi_type; switch (t->type) { @@ -82,11 +98,11 @@ static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) { default: GGML_ASSERT(false && "not implemented"); } - const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD); + const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, comm); GGML_ASSERT(retval == MPI_SUCCESS); } -static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) { +static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_Comm comm) { MPI_Datatype mpi_type; switch (t->type) { @@ -97,10 +113,72 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) { MPI_Status status; UNUSED(status); - const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status); GGML_ASSERT(retval == MPI_SUCCESS); } +uint16_t** ggml_mpi_split_range( + struct ggml_mpi_context * ctx_mpi, + uint16_t start, + uint16_t end, + float node_weights[] +) { + // Splits the range given by start and end + // over the available nodes. This implementation + // assumes that node 0 handles the final part of the range + // while node 1 handles the beginning, to form a ring pipeline + + // Only node 0 deals with the device splits, other nodes + // get the splits from the scatter layers operation + + if (ctx_mpi->rank != 0) { + return NULL; + } + + uint16_t range_length = end - start + 1; + uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size); + for (int i = 0; i < ctx_mpi->size; i++) { + ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2); + } + uint16_t next_layer = 0; + for (int i=1; i < ctx_mpi->size; i++) { + ranges[i][0] = next_layer; + ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start); + next_layer = ranges[i][1]; + } + + ranges[0][0] = next_layer; + ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start); + return ranges; + +} + +void ggml_mpi_scatter_layers( + struct ggml_mpi_context * ctx_mpi, + uint16_t ** layer_ranges +) { + // Layer ranges is a 2d array with the first dimension + // having a length of the number of nodes and the second + // dimension having a length of 2. The inner arrays contain + // the start and end layer ID for a node. 
+ uint16_t flattened_ranges[ctx_mpi->size * 2]; + + if (layer_ranges != NULL) { + for (int i = 0; i < ctx_mpi->size * 2; i += 2) { + fprintf(stderr, "In iteration %d\n", i); + flattened_ranges[i] = layer_ranges[i/2][0]; + fprintf(stderr, "Got first element\n"); + flattened_ranges[i + 1] = layer_ranges[i/2][1]; + } + } + + uint16_t received_range[2]; + MPI_Scatter(flattened_ranges, 2, MPI_UINT16_T, received_range, 2, MPI_UINT16_T, 0, ctx_mpi->comm); + ctx_mpi->layer_start = received_range[0]; + ctx_mpi->layer_end = received_range[1]; + fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end); +} + // TODO: there are many improvements that can be done to this implementation void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, @@ -134,29 +212,36 @@ void ggml_mpi_graph_compute_pre( // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node) // node 0: [(n-1) * n_per_node, n_nodes) // + + + if (mpi_rank > 0) { if (mpi_rank == 1) { // the first node (1) receives the input tokens from the main node (0) - ggml_mpi_tensor_recv(inp_tokens, 0); + ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); } else { // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) - ggml_mpi_tensor_recv(inp0, mpi_rank - 1); + ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); } } else if (mpi_size > 1) { // node 0 sends the input tokens to node 1 - ggml_mpi_tensor_send(inp_tokens, 1); + ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); // recv the output data from the last node - ggml_mpi_tensor_recv(inp0, mpi_size - 1); + ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); } { + + const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; const int mpi_idx = mpi_rank > 0 ? 
mpi_rank - 1 : mpi_size - 1; - const int il0 = (mpi_idx + 0) * n_per_node; - const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); + //const int il0 = (mpi_idx + 0) * n_per_node; + //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); + int il0 = ctx_mpi->layer_start; + int il1 = MIN(n_layers, ctx_mpi->layer_end); char name_l0[GGML_MAX_NAME]; char name_l1[GGML_MAX_NAME]; @@ -196,7 +281,6 @@ void ggml_mpi_graph_compute_pre( gf->n_nodes = idx_l1 - idx_l0; - //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1); } } @@ -211,6 +295,6 @@ void ggml_mpi_graph_compute_post( // send the output data to the next node if (mpi_rank > 0) { - ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size); + ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm); } } diff --git a/ggml-mpi.h b/ggml-mpi.h index eda119d44..2224943dc 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -1,4 +1,5 @@ #pragma once +#include struct ggml_context; struct ggml_tensor; @@ -14,15 +15,27 @@ void ggml_mpi_backend_init(void); void ggml_mpi_backend_free(void); struct ggml_mpi_context * ggml_mpi_init(void); +struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key); void ggml_mpi_free(struct ggml_mpi_context * ctx); int ggml_mpi_rank(struct ggml_mpi_context * ctx); - +int ggml_mpi_size(struct ggml_mpi_context * ctx); void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int * n_tokens, int * n_past, int * n_threads); +uint16_t** ggml_mpi_split_range( + struct ggml_mpi_context * ctx_mpi, + uint16_t start, + uint16_t end, + float node_weights[] +); + +void ggml_mpi_scatter_layers( + struct ggml_mpi_context * ctx_mpi, + uint16_t ** layer_ranges +); void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, diff --git a/llama.cpp b/llama.cpp index 2c3841974..fa170e7df 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1098,6 +1098,10 @@ struct llama_mmap { int flags = MAP_SHARED; // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } + +#ifdef GGML_USE_MPI + prefetch = 0; +#endif #ifdef __linux__ // advise the kernel to read the file sequentially (increases readahead) if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { @@ -1106,6 +1110,7 @@ struct llama_mmap { } if (prefetch) { flags |= MAP_POPULATE; } #endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { // NOLINT throw std::runtime_error(format("mmap failed: %s", strerror(errno))); @@ -12697,9 +12702,7 @@ void llama_backend_init(void) { ggml_free(ctx); } -#ifdef GGML_USE_MPI - ggml_mpi_backend_init(); -#endif + } void llama_numa_init(enum ggml_numa_strategy numa) { @@ -13075,20 +13078,21 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_MPI ctx->ctx_mpi = ggml_mpi_init(); - if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { - // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - // TODO: needs fix after #3228 - GGML_ASSERT(false && "not implemented"); - //const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); - //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; - llama_backend_free(); - exit(1); - } #endif return ctx; } +void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != device_weights.size()) { + GGML_ASSERT(false && "Must have same number of split percentages 
as devices"); + } + uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights.data()); + ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); +#endif +} + void llama_free(struct llama_context * ctx) { delete ctx; } diff --git a/llama.h b/llama.h index 90aa5372e..0a13b037d 100644 --- a/llama.h +++ b/llama.h @@ -8,7 +8,7 @@ #include #include #include - +#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD @@ -358,6 +358,8 @@ extern "C" { const char * path_model, struct llama_model_params params); + LLAMA_API void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights); + LLAMA_API void llama_free_model(struct llama_model * model); LLAMA_API struct llama_context * llama_new_context_with_model( From 40a810923a40f4c9f6e6802fede980a08663e081 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sun, 24 Sep 2023 23:34:00 -0500 Subject: [PATCH 02/35] Add documentation for ggml-mpi functions --- ggml-mpi.c | 1 + ggml-mpi.h | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/ggml-mpi.c b/ggml-mpi.c index a90897829..cef5ca6da 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -47,6 +47,7 @@ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int } void ggml_mpi_free(struct ggml_mpi_context * ctx) { + MPI_Comm_free(ctx->comm); free(ctx); } diff --git a/ggml-mpi.h b/ggml-mpi.h index 2224943dc..7eeb3856f 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -9,22 +9,133 @@ struct ggml_cgraph; extern "C" { #endif +/** + * The context used for MPI operations, + * a program may make use of more than one + * context but must always have at least one. + * + * The context stores required information like the + * node rank and a communicator to use for MPI operations. + * A context is guaranteed to be internally consistent, + * meaning that a context's stored rank is valid within + * the context's communicator. + */ struct ggml_mpi_context; + +/** + * Initialize the MPI library and the GGML MPI backend. + * Calling more than once during the lifetime of the program + * leads to undefined behavior. This function must be called before + * any MPI operations. + */ void ggml_mpi_backend_init(void); + +/** + * Frees the MPI backend, must be called only once at termination + * of the program. No MPI operations may be completed after calling this function, + * and attempting to do so will lead to undefined behavior. + */ void ggml_mpi_backend_free(void); +/** + * Construct a new MPI context using the MPI_WORLD + * communicator. This is useful only to create the + * initial context, as calling multiple times + * will only create effective copies of the same data. + * + * @return A context for us in the global communicator. + */ struct ggml_mpi_context * ggml_mpi_init(void); + +/** + * Create a new context by splitting the given context's + * communicator, creating a "sub-communicator." This is a collective + * operation and must be performed by all nodes within the same communicator. + * The color and key have the same meaning as in MPI_Comm_split(), i.e. + * the color is used to determine the sub-communicator this node will belong to, + * and the key is the relative rank of this node in the new communicator. + * + * An example: if a node passes a color of 1, and a different node passes a color of 2, + * the nodes will belong to two different sub-communicators. 
If two nodes pass the same + * color, then their ranks will be ordered by the order of their keys. If they pass the same + * key, then the tie will be broken by the nodes' ranks in the old communicator. + * + * The communicator used by the given context remains entirely valid, so it is advisable + * to store both the old and new contexts. This allows an application to + * select at runtime which communicator to perform MPI operations with. An example + * would be to segregate the nodes into multiple domains categorized by the functions + * they perform, and use the original context to broadcast to all nodes in the cluster. + * + * @param ctx The context containing the communicator to split. + * @param color The sub-communicator that this node will belong to. + * @param key The relative rank of this node in the new communicator. + * @return A new context with all values referencing the newly-created communicator. + */ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key); + +/** + * Frees the given context, including the communicator. No MPI + * operations besides ggml_mpi_backend_freee(void) should be executed after + * running this function. + * + * @param ctx The context to free. + */ void ggml_mpi_free(struct ggml_mpi_context * ctx); +/** + * Get the rank of this node in the given context's communicator. + * + * @param ctx The context to use to determine the rank with regards to. + * @return The rank of this node. + */ int ggml_mpi_rank(struct ggml_mpi_context * ctx); + +/** + * Get the number of nodes that are a part of + * the communicator referenced by the given context. + * + * @param ctx The context containing the communicator used for this size check. + * @return The number of nodes that are a part of the given context's communicator. + */ int ggml_mpi_size(struct ggml_mpi_context * ctx); + +/** + * Synchronize needed information among the nodes + * to prepare for running an evaluation iteration. + * This is a collective operation and all nodes must + * call this function. It will block until all + * nodes have entered it, to prevent any desync + * between nodes. + * + * @param ctx_mpi The context in which to prepare for evaluation. + * @param n_tokens A pointer to the n_tokens, which will be synchronized after this function. + * @param n_past A pointer to the n_past, which will be synchronized after this function. + * @param n_threads A pointer to the n_threads, which is unused currently. + */ void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int * n_tokens, int * n_past, int * n_threads); + +/** + * Split a range across all nodes within the given + * context, weighting the allocations by the given weights. + * The dimensions of the returned 2d array are (number of nodes in the context, 2). + * The first element in the inner array is the starting point of the range allocated + * to the node indicated by the index into the outer array, + * and the second element is the end point of the allocated range, inclusive. + * + * @param ctx_mpi The context used to determine the number of nodes + * to split the range across. + * @param start The starting point of the range. + * @param end The end point of the range, inclusive. + * @param node_weights How to weight the allocations across the nodes, + * must sum to 1.0. + * @return A 2d array, the first dimension is the number of nodes in the context + * and the second dimension is 2. 
+ */ uint16_t** ggml_mpi_split_range( struct ggml_mpi_context * ctx_mpi, uint16_t start, @@ -32,16 +143,42 @@ uint16_t** ggml_mpi_split_range( float node_weights[] ); +/** + * Scatter the layer ranges across all nodes + * in the given context. This is a collective operation + * and must be called by all nodes that are within the same + * communicator. The given layer ranges must be in the same + * format as created by the ggml_mpi_split_range(). + * + * @param ctx_mpi The context to scatter the layers across. + * @param layer_ranges The pre-split ranges to scatter to the nodes. + */ void ggml_mpi_scatter_layers( struct ggml_mpi_context * ctx_mpi, uint16_t ** layer_ranges ); +/** + * Modify compute graph to only process allocated + * layers. + * + * @param ctx_mpi The context containing the allocated layer range. + * @param gf The compute graph to modify + * @param n_layers The number of layers in the model, used as an upper bound in the layer ranges. + */ void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers); +/** + * Sends the output tensor to the next node for processing + * of later layers. + * + * @param ctx_mpi The context to use for MPI operations. + * @param gf The graph used in the computations + * @param n_layers The number of layers in the model. + */ void ggml_mpi_graph_compute_post( struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, From 1e78fa4f912eb0f91983d9b7f249aed49cabe578 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 09:58:44 -0500 Subject: [PATCH 03/35] Add code comments in MPI --- examples/mpi/mpi.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 8d14d8e61..38ed93746 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -20,6 +20,8 @@ #include #include #include + +// TODO add Windows support #include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) @@ -116,10 +118,13 @@ int main(int argc, char ** argv) { return 2; } + // Manually add the path used to launch this program to the + // options std::string rawOptions = argv[0]; rawOptions += ' '; std::ifstream optionsFile(argv[1]); if (optionsFile.is_open()) { + // Read in the options file, appending to the launch path std::ostringstream buf; buf << optionsFile.rdbuf(); rawOptions += buf.str(); @@ -130,22 +135,21 @@ int main(int argc, char ** argv) { return 3; } + // wordexp doesn't work right if there's a trailing newline, so strip it rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1); printf("%s", rawOptions.c_str()); wordexp_t splitOptions; wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD); - //char** loadedArgs = (char **) malloc(1 + sizeof(char*) * splitOptions.we_wordc); - //loadedArgs[0] = argv[0]; - //memcpy(&loadedArgs[1], splitOptions.we_wordv, sizeof(char*) * splitOptions.we_wordc); - printf("Loaded argc: %d", splitOptions.we_wordc); + fprintf(stderr, "Loaded arguments: "); for (int i = 0; i < splitOptions.we_wordc; i++) { - printf(" %s", splitOptions.we_wordv[i]); + fprintf(stderr, " %s", splitOptions.we_wordv[i]); } - printf("\n"); + fprintf(stderr, "\n"); + // Now we can parse like normal, but using the loaded options instead of the passed argv if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) { wordfree(&splitOptions); return 1; From 78112ab5c21495dbd56b7dd71f6d0c7d47be1edd Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 10:05:42 -0500 
Subject: [PATCH 04/35] Remove mtest (#3177) --- examples/mpi/mpi.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 38ed93746..393ef1b2a 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -233,23 +233,6 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters - // uncomment the "used_mem" line in llama.cpp to see the results - if (params.mem_test) { - { - fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); - - const std::vector tmp(params.n_batch, llama_token_bos(ctx)); - llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); - } - - llama_print_timings(ctx); - llama_free(ctx); - llama_free_model(model); - - return 0; - } - // export the cgraph and exit if (params.export_cgraph) { llama_eval_export(ctx, "llama.ggml"); From 4829c6224e6156ac8b7a7f063489ddf6bb123c0c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 10:15:30 -0500 Subject: [PATCH 05/35] Revert accidental removal of ggml_mpi_backend_init --- llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama.cpp b/llama.cpp index fa170e7df..1f674f13f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12709,6 +12709,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) { if (numa != GGML_NUMA_STRATEGY_DISABLED) { ggml_numa_init(numa); } + +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif } void llama_backend_free(void) { From 16eff5af6971205c5eff94e18d3c510b35ddf0e9 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 17:41:57 -0500 Subject: [PATCH 06/35] Disable warmup under MPI --- common/common.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 0d9b19cbe..a6bdae68f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1402,10 +1402,17 @@ std::tuple llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); +#ifndef GGML_USE_MPI + // When using MPI, llama_eval() enters into an infinite loop + // on non-head nodes. Thus, we only want to warmup the model here + // if we aren't using MPI. 
+ // FIXME have a way to terminate the infinite loop so we can warmup the model + // in MPI mode std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); llama_synchronize(lctx); +#endif llama_reset_timings(lctx); } From 8fe813130a4bc7dc9447ebc1f7393c18bc53e84c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 17:42:41 -0500 Subject: [PATCH 07/35] Update MPI example to follow main changes --- examples/mpi/mpi.cpp | 416 ++++++++++++++++++++----------------------- ggml-mpi.c | 2 - 2 files changed, 195 insertions(+), 223 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 393ef1b2a..84f15a82d 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -1,9 +1,5 @@ -// Defines sigaction on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "common.h" + #include "console.h" #include "llama.h" #include "build-info.h" @@ -40,7 +36,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif - static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -49,10 +44,12 @@ static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; -void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, - const std::vector input_tokens, const std::string output, const std::vector output_tokens) { +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens +) { if (params.logdir.empty()) { return; } @@ -93,7 +90,7 @@ void write_logfile( } #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -void sigint_handler(int signo) { +static void sigint_handler(int signo) { if (signo == SIGINT) { if (!is_interacting) { is_interacting = true; @@ -109,7 +106,6 @@ void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - gpt_params params; g_params = ¶ms; @@ -156,6 +152,15 @@ int main(int argc, char ** argv) { } wordfree(&splitOptions); +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("main", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + // TODO: Dump params ? + //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); @@ -178,34 +183,28 @@ int main(int argc, char ** argv) { } if (params.rope_freq_base != 10000.0) { - fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 1.0) { - fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); } - if (params.n_ctx > 2048) { - // TODO: determine the actual max context of the model (e.g. 
4096 for LLaMA v2) and use that instead of 2048 - fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); - } else if (params.n_ctx < 8) { - fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; - } - - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + LOG_TEE("%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { params.prompt = gpt_random_prompt(rng); } + LOG("%s: llama backend init\n", __func__); llama_backend_init(params.numa); llama_model * model; @@ -215,6 +214,7 @@ int main(int argc, char ** argv) { g_ctx = &ctx; // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); if (params.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); @@ -222,14 +222,23 @@ int main(int argc, char ** argv) { } if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_TEE("%s: error: unable to load model\n", __func__); return 1; } + const int n_ctx_train = llama_n_ctx_train(ctx); + if (params.n_ctx > n_ctx_train) { + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } else if (params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + LOG_TEE("\n"); + LOG_TEE("system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } @@ -241,13 +250,14 @@ int main(int argc, char ** argv) { return 0; } + llama_split_layers_weighted(ctx, params.mpi_layer_split); std::string path_session = params.path_prompt_cache; std::vector session_tokens; if (!path_session.empty()) { - fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); // fopen to check for existing session FILE * fp = std::fopen(path_session.c_str(), "rb"); @@ -257,33 +267,38 @@ int main(int argc, char ** argv) { session_tokens.resize(params.n_ctx); size_t n_token_count_out = 0; if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); llama_set_rng_seed(ctx, params.seed); - fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); + LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); } else { - fprintf(stderr, "%s: session file does not exist, will create\n", __func__); + 
LOG_TEE("%s: session file does not exist, will create\n", __func__); } } - // Add BOS if SPM tokenizer const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + LOG("add_bos: %d\n", add_bos); - // tokenize the prompt std::vector embd_inp; if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + LOG("tokenize the prompt\n"); embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); } else { + LOG("use session tokens\n"); embd_inp = session_tokens; } + LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(ctx)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); } // Tokenize negative prompt @@ -291,23 +306,31 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; + LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); + LOG("guidance_offset: %s", log_tostr(guidance_offset)); } const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); if ((int) embd_inp.size() > n_ctx - 4) { - fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } // debug message about similarity of saved session, if applicable size_t n_matching_session_tokens = 0; - if (session_tokens.size()) { + if (!session_tokens.empty()) { for (llama_token id : session_tokens) { if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { break; @@ -315,22 +338,27 @@ int main(int argc, char ** argv) { n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - fprintf(stderr, "%s: using full prompt from session file\n", __func__); + LOG_TEE("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); + LOG_TEE("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); } } + LOGLN( + "recalculate the cached logits 
(check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", + log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); + // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token token to recalculate the cached logits - if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && - session_tokens.size() > embd_inp.size()) { + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { + LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + session_tokens.resize(embd_inp.size() - 1); } @@ -343,6 +371,9 @@ int main(int argc, char ** argv) { const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { params.interactive_first = true; @@ -355,30 +386,30 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); - fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > 0) { - fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - fprintf(stderr, "'\n"); + LOG_TEE("'\n"); } - fprintf(stderr, "\n"); + LOG_TEE("\n"); } if (params.interactive) { @@ -395,58 +426,59 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - fprintf(stderr, "%s: interactive mode on.\n", __func__); + LOG_TEE("%s: interactive mode on.\n", __func__); - if (params.antiprompt.size()) { - for (auto antiprompt : params.antiprompt) { - fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + if (!params.antiprompt.empty()) { + for (const auto & 
antiprompt : params.antiprompt) { + LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); } } if (params.input_prefix_bos) { - fprintf(stderr, "Input prefix with BOS\n"); + LOG_TEE("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); } if (!params.input_suffix.empty()) { - fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); - fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - fprintf(stderr, "\n\n"); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); + struct llama_grammar * grammar = NULL; grammar_parser::parse_state parsed_grammar; - llama_grammar * grammar = NULL; + if (!params.grammar.empty()) { parsed_grammar = grammar_parser::parse(params.grammar.c_str()); // will be empty (default) if there are parse errors if (parsed_grammar.rules.empty()) { return 1; } - fprintf(stderr, "%s: grammar:\n", __func__); + LOG_TEE("%s: grammar:\n", __func__); grammar_parser::print_grammar(stderr, parsed_grammar); - fprintf(stderr, "\n"); + LOG_TEE("\n"); { auto it = params.logit_bias.find(llama_token_eos(ctx)); if (it != params.logit_bias.end() && it->second == -INFINITY) { - fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); } } std::vector grammar_rules(parsed_grammar.c_rules()); grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } // TODO: replace with ring-buffer - std::vector last_n_tokens(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + std::vector last_tokens(n_ctx); + std::fill(last_tokens.begin(), last_tokens.end(), 0); if (params.interactive) { const char *control_message; @@ -458,11 +490,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - fprintf(stderr, "== Running in interactive mode. ==\n" + LOG_TEE("== Running in interactive mode. 
==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - " - Press Ctrl+C to interject at any time.\n" + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); #endif - "%s\n", control_message); + LOG_TEE( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -487,27 +519,27 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - // do one empty run to warm up the model - { - const std::vector tmp = { llama_token_bos(ctx), }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); - llama_reset_timings(ctx); - } + const int n_vocab = llama_n_vocab(ctx); + + std::vector candidates; + candidates.reserve(n_vocab); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict - if (embd.size() > 0) { + if (!embd.empty()) { // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. - auto max_embd_size = n_ctx - 4; + int max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. - if ((int)embd.size() > max_embd_size) { - auto skipped_tokens = embd.size() - max_embd_size; + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console::set_display(console::reset); fflush(stdout); - embd.resize(max_embd_size); } // infinite text generation via context swapping @@ -516,28 +548,26 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { if (params.n_predict == -2) { - fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__); + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep; + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); + // always keep the first token - BOS - n_past = std::max(1, params.n_keep); + n_past = std::max(1, params.n_keep); n_past_guidance = std::max(1, params.n_keep + guidance_offset); - // insert n_left/2 tokens at the start of embd from last_n_tokens - embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - // stop saving session if we run out of context + // insert n_left/2 tokens at the start of embd from last_tokens + embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); + + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + + LOG("clear session path\n"); path_session.clear(); - - //printf("\n---\n"); - //printf("resetting: '"); - //for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_piece(ctx, embd[i])); - //} - //printf("'\n"); - //printf("\n---\n"); } // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) @@ -567,7 +597,7 @@ int main(int argc, char ** argv) { if (ctx_guidance) { int input_size = 0; - llama_token* input_buf = NULL; + llama_token * input_buf = NULL; if (n_past_guidance < (int) 
guidance_inp.size()) { // Guidance context should have the same data with these modifications: @@ -577,28 +607,25 @@ int main(int argc, char ** argv) { embd_guidance = guidance_inp; if (embd.begin() + original_prompt_len < embd.end()) { embd_guidance.insert( - embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() ); } - input_buf = embd_guidance.data(); + input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - //fprintf(stderr, "\n---------------------\n"); - //for (int i = 0; i < (int) embd_guidance.size(); i++) { - //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); - //} - //fprintf(stderr, "\n---------------------\n"); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); } else { - input_buf = embd.data(); + input_buf = embd.data(); input_size = embd.size(); } for (int i = 0; i < input_size; i += params.n_batch) { int n_eval = std::min(input_size - i, params.n_batch); if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -611,14 +638,20 @@ int main(int argc, char ** argv) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return 1; } + n_past += n_eval; + + LOG("n_past = %d\n", n_past); } - if (embd.size() > 0 && !path_session.empty()) { + if (!embd.empty() && !path_session.empty()) { session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); n_session_consumed = session_tokens.size(); } @@ -628,106 +661,21 @@ int main(int argc, char ** argv) { embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - // optionally save the session on first sample (for faster prompt loading next time) if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { need_to_save_session = false; llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + LOG("saved session to %s\n", path_session.c_str()); } - llama_token id = 0; + const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - if (ctx_guidance) { - llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale); - } - - // Apply penalties - float nl_logit = logits[llama_token_nl(ctx)]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) { - for (size_t idx = 0; idx < candidates_p.size; idx++) { - if (candidates_p.data[idx].id == llama_token_nl(ctx)) { - candidates_p.data[idx].logit = nl_logit; - break; - } - } - } - - if (grammar != NULL) { - llama_sample_grammar(ctx, &candidates_p, grammar); - } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - // printf("`%d`", candidates_p.size); - - if (grammar != NULL) { - llama_grammar_accept_token(ctx, grammar, id); - } - - 
last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - } - - // add it to the context embd.push_back(id); // echo this to console @@ -735,12 +683,15 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; + + LOG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(embd_inp[n_consumed]); ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -764,17 +715,16 @@ int main(int argc, char ** argv) { fflush(stdout); } // reset color to default if we there is no pending user input - if (input_echo && (int)embd_inp.size() == n_consumed) { + if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); } // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt - if (params.antiprompt.size()) { + if (!params.antiprompt.empty()) { std::string last_output; - for (auto id : last_n_tokens) { + for (auto id : last_tokens) { last_output += llama_token_to_piece(ctx, id); } @@ -785,10 +735,10 @@ int main(int argc, char ** argv) { for (std::string & antiprompt : params.antiprompt) { size_t extra_padding = params.interactive ? 0 : 2; size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) - ? last_output.length() - static_cast(antiprompt.length() + extra_padding) - : 0; + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; - if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { if (params.interactive) { is_interacting = true; console::set_display(console::user_input); @@ -798,12 +748,18 @@ int main(int argc, char ** argv) { break; } } + + if (is_antiprompt) { + LOG("found antiprompt: %s\n", last_output.c_str()); + } } // deal with end of text token in interactive mode - if (last_n_tokens.back() == llama_token_eos(ctx)) { + if (last_tokens.back() == llama_token_eos(ctx)) { + LOG("found EOS token\n"); + if (params.interactive) { - if (params.antiprompt.size() != 0) { + if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); @@ -820,16 +776,20 @@ int main(int argc, char ** argv) { } if (n_past > 0 && is_interacting) { + LOG("waiting for user input\n"); + if (params.instruct) { printf("\n> "); } if (params.input_prefix_bos) { + LOG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(ctx)); } std::string buffer; if (!params.input_prefix.empty()) { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); buffer += params.input_prefix; printf("%s", buffer.c_str()); } @@ -849,23 +809,30 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty()) { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); buffer += params.input_suffix; printf("%s", params.input_suffix.c_str()); } + LOG("buffer: 
'%s'\n", buffer.c_str()); + const size_t original_size = embd_inp.size(); // instruct mode: insert instruction prefix if (params.instruct && !is_antiprompt) { + LOG("inserting instruction prefix\n"); n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } - auto line_inp = ::llama_tokenize(ctx, buffer, false); + const auto line_inp = ::llama_tokenize(ctx, buffer, false); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); // instruct mode: insert response suffix if (params.instruct) { + LOG("inserting instruction suffix\n"); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } @@ -876,6 +843,9 @@ int main(int argc, char ** argv) { } n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } else { + LOG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -887,10 +857,10 @@ int main(int argc, char ** argv) { if (grammar != NULL) { llama_grammar_free(grammar); - std::vector grammar_rules( parsed_grammar.c_rules()); + std::vector grammar_rules(parsed_grammar.c_rules()); grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); + grammar_rules.data(), grammar_rules.size(), + parsed_grammar.symbol_ids.at("root")); } } is_interacting = false; @@ -899,7 +869,7 @@ int main(int argc, char ** argv) { // end of text token if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { - fprintf(stderr, " [end of text]\n"); + LOG_TEE(" [end of text]\n"); break; } @@ -912,7 +882,7 @@ int main(int argc, char ** argv) { } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } @@ -928,5 +898,9 @@ int main(int argc, char ** argv) { } llama_backend_free(); +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n") +#endif // LOG_DISABLE_LOGS + return 0; } diff --git a/ggml-mpi.c b/ggml-mpi.c index cef5ca6da..9217651d6 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -166,9 +166,7 @@ void ggml_mpi_scatter_layers( if (layer_ranges != NULL) { for (int i = 0; i < ctx_mpi->size * 2; i += 2) { - fprintf(stderr, "In iteration %d\n", i); flattened_ranges[i] = layer_ranges[i/2][0]; - fprintf(stderr, "Got first element\n"); flattened_ranges[i + 1] = layer_ranges[i/2][1]; } } From 6c07d6cfa1065fb7526adf32b1e5310362c18ccb Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 17:52:57 -0500 Subject: [PATCH 08/35] Remove fprintf logs from mpi main --- examples/mpi/mpi.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 84f15a82d..0bf8f2f80 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -134,16 +134,8 @@ int main(int argc, char ** argv) { // wordexp doesn't work right if there's a trailing newline, so strip it rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1); - printf("%s", rawOptions.c_str()); - wordexp_t splitOptions; wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD); - fprintf(stderr, "Loaded arguments: "); - for (int i = 0; i < splitOptions.we_wordc; i++) { - - fprintf(stderr, " %s", splitOptions.we_wordv[i]); - } - 
fprintf(stderr, "\n"); // Now we can parse like normal, but using the loaded options instead of the passed argv if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) { From 364b7071308e57eb287558d9ff80de58806c71ad Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 25 Sep 2023 18:59:14 -0500 Subject: [PATCH 09/35] Remove unrelated sections from mpi readme --- examples/mpi/README.md | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/examples/mpi/README.md b/examples/mpi/README.md index 44a047915..4b934b0ed 100644 --- a/examples/mpi/README.md +++ b/examples/mpi/README.md @@ -48,26 +48,6 @@ AI: What would you like to talk about? User:' ``` -#### Windows: - -```powershell -main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" -``` - -The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): - -#### Unix-based systems (Linux, macOS, etc.): - -```bash -./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt -``` - -#### Windows: - -```powershell -main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt -``` - ## Common Options In this section, we cover the most commonly used options for running the `mpi` program with the LLaMA models: From fda60ead35f76ff05008e95a4c772772dfba50db Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 28 Sep 2023 12:39:34 -0500 Subject: [PATCH 10/35] Replace vector with C-style array and length in llama_split_layers_weighted --- examples/mpi/mpi.cpp | 2 +- llama.cpp | 6 +++--- llama.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 0bf8f2f80..5dfa70f5c 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -243,7 +243,7 @@ int main(int argc, char ** argv) { return 0; } - llama_split_layers_weighted(ctx, params.mpi_layer_split); + llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); std::string path_session = params.path_prompt_cache; std::vector session_tokens; diff --git a/llama.cpp b/llama.cpp index 1f674f13f..98ffa1075 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13087,12 +13087,12 @@ struct llama_context * llama_new_context_with_model( return ctx; } -void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights) { +void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) { #ifdef GGML_USE_MPI - if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != device_weights.size()) { + if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) { GGML_ASSERT(false && "Must have same number of split percentages as devices"); } - uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights.data()); + uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); #endif } diff --git a/llama.h b/llama.h index 0a13b037d..7ad4c9257 100644 --- a/llama.h +++ b/llama.h @@ -358,7 +358,7 @@ extern "C" { const char * path_model, struct llama_model_params params); - LLAMA_API void llama_split_layers_weighted(struct llama_context * ctx, std::vector device_weights); + LLAMA_API void llama_split_layers_weighted(struct 
llama_context * ctx, float device_weights[], size_t num_weights); LLAMA_API void llama_free_model(struct llama_model * model); From 50a63eb5f958947b66d9949ff2efee8f759f9999 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 24 Oct 2023 12:00:52 -0500 Subject: [PATCH 11/35] Fix minor rebase errors --- llama.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.h b/llama.h index 7ad4c9257..fe6ba4a47 100644 --- a/llama.h +++ b/llama.h @@ -8,7 +8,6 @@ #include #include #include -#include #ifdef LLAMA_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef LLAMA_BUILD From ede7ff0c6696c381de9aa3b13b24bc8bab7807d7 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 25 Oct 2023 17:15:11 -0500 Subject: [PATCH 12/35] Fix MPI compilation errors --- examples/mpi/mpi.cpp | 234 ++++++++++++++++++++----------------------- 1 file changed, 106 insertions(+), 128 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 5dfa70f5c..3030918bf 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -3,7 +3,6 @@ #include "console.h" #include "llama.h" #include "build-info.h" -#include "grammar-parser.h" #include #include @@ -143,6 +142,7 @@ int main(int argc, char ** argv) { return 1; } wordfree(&splitOptions); + llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("main", "log")); @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); - if (params.perplexity) { + if (params.logits_all) { printf("\n************\n"); printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); printf("************\n\n"); @@ -174,12 +174,17 @@ int main(int argc, char ** argv) { return 0; } - if (params.rope_freq_base != 10000.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; } - if (params.rope_freq_scale != 1.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + if (params.rope_freq_base != 0.0) { + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) { + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); @@ -208,7 +213,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (params.cfg_scale > 1.f) { + if (sparams.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); ctx_guidance = llama_new_context_with_model(model, lparams); } @@ -218,29 +223,19 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx_train = llama_n_ctx_train(ctx); - if (params.n_ctx > n_ctx_train) { + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); + + if (n_ctx > n_ctx_train) { LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); - } else if (params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context 
size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; + __func__, n_ctx_train, n_ctx); } // print system information { LOG_TEE("\n"); - LOG_TEE("system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - - // export the cgraph and exit - if (params.export_cgraph) { - llama_eval_export(ctx, "llama.ggml"); - llama_free(ctx); - llama_free_model(model); - - return 0; + LOG_TEE("%s\n", get_system_info(params).c_str()); } llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); @@ -256,7 +251,7 @@ int main(int argc, char ** argv) { if (fp != NULL) { std::fclose(fp); - session_tokens.resize(params.n_ctx); + session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); @@ -271,26 +266,26 @@ int main(int argc, char ** argv) { } } - const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; LOG("add_bos: %d\n", add_bos); std::vector embd_inp; if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); } else { LOG("use session tokens\n"); embd_inp = session_tokens; } LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(ctx)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } // Tokenize negative prompt @@ -298,13 +293,13 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp)); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; @@ -312,9 +307,6 @@ int main(int argc, char ** argv) { LOG("guidance_offset: %s", log_tostr(guidance_offset)); } - const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); - if ((int) embd_inp.size() > n_ctx - 4) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx 
- 4); return 1; @@ -340,6 +332,9 @@ int main(int argc, char ** argv) { LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", __func__, n_matching_session_tokens, embd_inp.size()); } + + // remove any "future" tokens that we might have inherited from the previous session + llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); } LOGLN( @@ -360,11 +355,11 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx)); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx)); + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { @@ -387,7 +382,7 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); @@ -423,6 +418,12 @@ int main(int argc, char ** argv) { if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } } @@ -432,45 +433,27 @@ int main(int argc, char ** argv) { if (!params.input_prefix.empty()) { LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } } if (!params.input_suffix.empty()) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); - } - } - LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - LOG_TEE("\n\n"); - - struct llama_grammar * grammar = NULL; - grammar_parser::parse_state parsed_grammar; - - if (!params.grammar.empty()) { - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - return 1; - } - LOG_TEE("%s: grammar:\n", __func__); - 
grammar_parser::print_grammar(stderr, parsed_grammar); - LOG_TEE("\n"); - - { - auto it = params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.logit_bias.end() && it->second == -INFINITY) { - LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } } } - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } - - // TODO: replace with ring-buffer - std::vector last_tokens(n_ctx); - std::fill(last_tokens.begin(), last_tokens.end(), 0); + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); if (params.interactive) { const char *control_message; @@ -511,10 +494,7 @@ int main(int argc, char ** argv) { std::vector embd; std::vector embd_guidance; - const int n_vocab = llama_n_vocab(ctx); - - std::vector candidates; - candidates.reserve(n_vocab); + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -544,19 +524,24 @@ int main(int argc, char ** argv) { break; } - const int n_left = n_past - params.n_keep; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; - // always keep the first token - BOS - n_past = std::max(1, params.n_keep); - n_past_guidance = std::max(1, params.n_keep + guidance_offset); + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + if (ctx_guidance) { + n_past_guidance -= n_discard; + } LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - // insert n_left/2 tokens at the start of embd from last_tokens - embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); - - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG("clear session path\n"); path_session.clear(); @@ -586,7 +571,6 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { int input_size = 0; llama_token * input_buf = NULL; @@ -608,7 +592,7 @@ int main(int argc, char ** argv) { input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); } else { input_buf = embd.data(); input_size = embd.size(); @@ -616,7 +600,7 @@ int main(int argc, char ** argv) { for (int i = 0; i < input_size; i += params.n_batch) { int n_eval = std::min(input_size - i, params.n_batch); - if 
(llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -631,9 +615,9 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -661,12 +645,11 @@ int main(int argc, char ** argv) { LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(id); + llama_sampling_accept(ctx_sampling, ctx, id, true); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); embd.push_back(id); @@ -682,8 +665,11 @@ int main(int argc, char ** argv) { LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_tokens.erase(last_tokens.begin()); - last_tokens.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -706,19 +692,17 @@ int main(int argc, char ** argv) { } fflush(stdout); } - // reset color to default if we there is no pending user input + // reset color to default if there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); } // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt + // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { - std::string last_output; - for (auto id : last_tokens) { - last_output += llama_token_to_piece(ctx, id); - } + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. 
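The context-swap arithmetic in the hunk above (n_left, n_discard, and the follow-up n_past adjustment) is easier to follow with concrete numbers. The sketch below only traces that bookkeeping with made-up values (n_ctx = 512, n_keep = 4); it does not call the real llama_kv_cache_* functions, it just prints which positions the two calls would drop and shift.

```c
#include <stdio.h>

// Trace only the bookkeeping of the context swap above: once the cache is
// full, half of the non-kept tokens are discarded and the rest slide left.
int main(void) {
    const int n_ctx  = 512; // assumed context size
    const int n_keep = 4;   // assumed number of tokens to always keep

    int n_past = n_ctx;     // cache just filled up
    const int n_left    = n_past - n_keep - 1;
    const int n_discard = n_left / 2;

    printf("n_left = %d, n_discard = %d\n", n_left, n_discard);
    // tokens [n_keep + 1, n_keep + n_discard] are removed,
    // tokens [n_keep + 1 + n_discard, n_past) shift left by n_discard
    printf("remove positions %d..%d\n", n_keep + 1, n_keep + n_discard);
    printf("shift  positions %d..%d by -%d\n", n_keep + 1 + n_discard, n_past - 1, n_discard);

    n_past -= n_discard;
    printf("n_past after swap = %d\n", n_past);
    return 0;
}
```

With these example numbers, half of the 507 non-kept tokens (253) are dropped and generation resumes at n_past = 259.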
@@ -733,10 +717,8 @@ int main(int argc, char ** argv) { if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { if (params.interactive) { is_interacting = true; - console::set_display(console::user_input); } is_antiprompt = true; - fflush(stdout); break; } } @@ -747,21 +729,19 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (last_tokens.back() == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { LOG("found EOS token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } is_interacting = true; printf("\n"); - console::set_display(console::user_input); - fflush(stdout); } else if (params.instruct) { is_interacting = true; } @@ -782,10 +762,12 @@ int main(int argc, char ** argv) { std::string buffer; if (!params.input_prefix.empty()) { LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - buffer += params.input_prefix; - printf("%s", buffer.c_str()); + printf("%s", params.input_prefix.c_str()); } + // color user input only + console::set_display(console::user_input); + std::string line; bool another_line = true; do { @@ -802,7 +784,6 @@ int main(int argc, char ** argv) { // append input suffix if any if (!params.input_suffix.empty()) { LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - buffer += params.input_suffix; printf("%s", params.input_suffix.c_str()); } @@ -816,11 +797,18 @@ int main(int argc, char ** argv) { n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } + if (params.escape) { + process_escapes(buffer); + } - const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); // instruct mode: insert response suffix if (params.instruct) { @@ -845,15 +833,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - // reset grammar state if we're restarting generation - if (grammar != NULL) { - llama_grammar_free(grammar); - - std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), - parsed_grammar.symbol_ids.at("root")); - } + llama_sampling_reset(ctx_sampling); } is_interacting = false; } @@ -885,13 +865,11 @@ int main(int argc, char ** argv) { llama_free(ctx); llama_free_model(model); - if (grammar != NULL) { - llama_grammar_free(grammar); - } + llama_sampling_free(ctx_sampling); llama_backend_free(); #ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n") + LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS return 0; From bcfb190c2865389f6805cc89b4709c2a270002f7 Mon Sep 17 00:00:00 2001 From: Branden Butler 
Date: Sun, 29 Oct 2023 15:16:16 -0500 Subject: [PATCH 13/35] Synchronize batch sequence info, fixing MPI for llama_decode() --- common/common.cpp | 2 +- ggml-mpi.c | 63 ++++++++++++++++++++++++++++++++++++++++++----- ggml-mpi.h | 21 +++++++++++----- llama.cpp | 29 +++++++++++++++------- 4 files changed, 93 insertions(+), 22 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a6bdae68f..c58477fd6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1403,7 +1403,7 @@ std::tuple llama_init_from_gpt_par LOG("warming up the model with an empty run\n"); #ifndef GGML_USE_MPI - // When using MPI, llama_eval() enters into an infinite loop + // When using MPI, llama_decode() enters into an infinite loop // on non-head nodes. Thus, we only want to warmup the model here // if we aren't using MPI. // FIXME have a way to terminate the infinite loop so we can warmup the model diff --git a/ggml-mpi.c b/ggml-mpi.c index 9217651d6..1e4d0b376 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -60,16 +60,67 @@ int ggml_mpi_size(struct ggml_mpi_context * ctx) { } void ggml_mpi_eval_init( - struct ggml_mpi_context * ctx_mpi, - int * n_tokens, - int * n_past, - int * n_threads) { + struct ggml_mpi_context * ctx_mpi, + int32_t * n_tokens, + int32_t ** pos, + int32_t ** n_seq_ids, + int32_t *** seq_id, + int8_t ** logits) { MPI_Barrier(ctx_mpi->comm); - MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); - MPI_Bcast(n_past, 1, MPI_INT, 0, ctx_mpi->comm); + MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); + + if (ctx_mpi->rank != 0) { + *pos = calloc(*n_tokens, sizeof(int32_t)); + *n_seq_ids = calloc(*n_tokens, sizeof(int32_t)); + *logits = calloc(*n_tokens, sizeof(int8_t)); + } + + int32_t total_n_seq_ids = 0; + for (size_t i = 0; i < *n_tokens; i++) { + total_n_seq_ids += (*n_seq_ids)[i]; + } + + MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + + int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t)); + + int32_t current_index = 0; + + if (ctx_mpi->rank == 0) { + for (size_t i = 0; i < *n_tokens; i++) { + for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + flattened_seq_ids[current_index] = (*seq_id)[i][j]; + current_index++; + } + } + } + + + MPI_Bcast(*pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); + //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); + int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); + current_index = 0; + for (size_t i = 0; i < *n_tokens; i++) { + new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t)); + for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + new_seq_id[i][j] = flattened_seq_ids[current_index]; + current_index++; + } + } + free(flattened_seq_ids); + *seq_id = new_seq_id; +} + +void ggml_mpi_synch_int( + struct ggml_mpi_context * ctx_mpi, + int32_t * val +) { + MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm); } static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { diff --git a/ggml-mpi.h b/ggml-mpi.h index 7eeb3856f..f3c4bf2aa 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -110,14 +110,23 @@ int ggml_mpi_size(struct ggml_mpi_context * ctx); * * @param ctx_mpi The context in which to prepare for evaluation. * @param n_tokens A pointer to the n_tokens, which will be synchronized after this function. - * @param n_past A pointer to the n_past, which will be synchronized after this function. 
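Note on the flattening in `ggml_mpi_eval_init` above: MPI cannot broadcast a ragged `int32_t **` directly, so the per-token sequence-id lists are packed into one contiguous buffer on rank 0, broadcast, and re-expanded on the receiving ranks. A self-contained sketch of that scheme, using plain MPI and illustrative names rather than the llama.cpp types:

```cpp
// flatten_bcast.cpp - illustrative sketch only; build with: mpicxx flatten_bcast.cpp
#include <mpi.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Rank 0 owns the real batch metadata: one list of sequence ids per token.
    std::vector<std::vector<int32_t>> seq_id;
    if (rank == 0) {
        seq_id = {{0}, {0, 1}, {2}};
    }

    // 1) broadcast the token count
    int32_t n_tokens = rank == 0 ? (int32_t) seq_id.size() : 0;
    MPI_Bcast(&n_tokens, 1, MPI_INT32_T, 0, MPI_COMM_WORLD);

    // 2) broadcast the per-token list lengths
    std::vector<int32_t> n_seq_ids(n_tokens, 0);
    if (rank == 0) {
        for (int32_t i = 0; i < n_tokens; i++) n_seq_ids[i] = (int32_t) seq_id[i].size();
    }
    MPI_Bcast(n_seq_ids.data(), n_tokens, MPI_INT32_T, 0, MPI_COMM_WORLD);

    // 3) flatten on rank 0, broadcast the flat buffer, rebuild on the other ranks
    int32_t total = 0;
    for (int32_t n : n_seq_ids) total += n;

    std::vector<int32_t> flat(total, 0);
    if (rank == 0) {
        int32_t k = 0;
        for (const auto & ids : seq_id) for (int32_t id : ids) flat[k++] = id;
    }
    MPI_Bcast(flat.data(), total, MPI_INT32_T, 0, MPI_COMM_WORLD);

    if (rank != 0) {
        seq_id.resize(n_tokens);
        int32_t k = 0;
        for (int32_t i = 0; i < n_tokens; i++) {
            seq_id[i].assign(flat.begin() + k, flat.begin() + k + n_seq_ids[i]);
            k += n_seq_ids[i];
        }
    }

    printf("rank %d: token 1 has %zu seq ids\n", rank, seq_id[1].size());
    MPI_Finalize();
}
```

The patch applies the same single-source-of-truth pattern to `n_tokens`, `pos` and `n_seq_ids`, with rank 0 always acting as the broadcast root.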
- * @param n_threads A pointer to the n_threads, which is unused currently. + * @param pos A pointer to the pos array, which will be synchronized after this function. + * @param n_seq_ids A pointer to the n_seq_ids array, which will be synchronized after this function. + * @param seq_id A pointer to the seq_id 2D array, which will be synchronized after this function. + * @param logits A pointer to the logits array, which is unused currently since only node 0 needs them. */ void ggml_mpi_eval_init( - struct ggml_mpi_context * ctx_mpi, - int * n_tokens, - int * n_past, - int * n_threads); + struct ggml_mpi_context * ctx_mpi, + int32_t * n_tokens, + int32_t ** pos, + int32_t ** n_seq_ids, + int32_t *** seq_id, + int8_t ** logits); + +void ggml_mpi_synch_int( + struct ggml_mpi_context * ctx_mpi, + int32_t * val + ); /** * Split a range across all nodes within the given diff --git a/llama.cpp b/llama.cpp index 98ffa1075..a5f56b552 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8776,8 +8776,7 @@ static int llama_decode_internal( llama_context & lctx, llama_batch batch_all) { // TODO: rename back to batch - const uint32_t n_tokens_all = batch_all.n_tokens; - + uint32_t n_tokens_all = batch_all.n_tokens; if (n_tokens_all == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); return -1; @@ -8798,11 +8797,7 @@ static int llama_decode_internal( } lctx.n_queued_tokens += n_tokens_all; -#ifdef GGML_USE_MPI - // TODO: needs fix after #3228 - GGML_ASSERT(false && "not implemented"); - //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif + auto & kv_self = lctx.kv_self; @@ -8828,7 +8823,7 @@ static int llama_decode_internal( std::vector> seq_id; for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) { - const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); + uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); llama_batch u_batch = { /* .n_tokens = */ (int32_t) n_tokens, /* .token = */ batch_all.token ? 
batch_all.token + cur_token : nullptr, @@ -8881,7 +8876,12 @@ static int llama_decode_internal( kv_self.head = 0; } - if (!llama_kv_cache_find_slot(kv_self, u_batch)) { + #ifdef GGML_USE_MPI + // TODO: needs fix after #3228 + ggml_mpi_eval_init(lctx.ctx_mpi, &(u_batch.n_tokens), &(u_batch.pos), &(u_batch.n_seq_id), &(u_batch.seq_id), &(u_batch.logits)); + n_tokens = u_batch.n_tokens; +#endif + if (!llama_kv_cache_find_slot(kv_self, u_batch)) { return 1; } @@ -13923,6 +13923,17 @@ void llama_batch_free(struct llama_batch batch) { int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch) { + +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const int n_ctx = llama_n_ctx(ctx); + std::vector tmp(n_ctx, llama_token_bos(&ctx->model)); + while (llama_decode_internal(*ctx, batch) >= 0){}; + llama_backend_free(); + exit(1); + } +#endif const int ret = llama_decode_internal(*ctx, batch); if (ret < 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); From 888d4f591b246ea74016a54f2a289aab24731744 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 30 Oct 2023 10:50:20 -0500 Subject: [PATCH 14/35] Update MPI code to new KV seq rm and bos/eos model APIs --- examples/mpi/mpi.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 3030918bf..b4944099e 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -284,7 +284,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } @@ -334,7 +334,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOGLN( @@ -729,7 +729,7 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { LOG("found EOS token\n"); if (params.interactive) { @@ -756,7 +756,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; @@ -840,7 +840,7 @@ int main(int argc, char ** argv) { } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) { LOG_TEE(" [end of text]\n"); break; } From b7599f7a563d567447e0c991e1dc1c84eb002e38 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 31 Oct 2023 15:55:15 -0500 Subject: [PATCH 15/35] Fix some mpi mem leaks, add mpi-layer-split to help when using mpi --- common/common.cpp | 3 +++ ggml-mpi.c | 52 +++++++++++++++++++++++++++++------------------ ggml-mpi.h | 3 ++- llama.cpp | 1 + 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c58477fd6..d924c80dc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1094,6 +1094,9 @@ void 
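The `ggml_mpi_rank(ctx->ctx_mpi) > 0` branch added to `llama_decode` above is the driver pattern in miniature: non-head ranks enter the decode loop with dummy input and block inside the broadcasts issued by rank 0, so only rank 0 ever handles user input or sampling. A reduced sketch of that control flow with plain MPI; the negative-count shutdown sentinel is an assumption of this sketch (the patch instead keeps the workers looping until a decode call fails, then exits):

```cpp
// driver_loop.cpp - illustrative sketch only; build with: mpicxx driver_loop.cpp
#include <mpi.h>
#include <cstdio>

// One "decode" step: rank 0 broadcasts how much work there is, everyone does it.
// A negative count is used here as a stop signal (sketch-only convention).
static int decode_step(int rank, int n_tokens) {
    MPI_Bcast(&n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (n_tokens < 0) return -1;
    printf("rank %d: processing %d tokens\n", rank, n_tokens);
    return 0;
}

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank > 0) {
        // Worker ranks: loop with dummy input, driven entirely by the
        // broadcasts that rank 0 issues inside decode_step().
        while (decode_step(rank, /*dummy*/ 0) >= 0) { }
        MPI_Finalize();
        return 0;
    }

    // Head rank: issues the real work, then broadcasts the stop sentinel.
    for (int step = 0; step < 3; step++) {
        decode_step(rank, 8 + step);
    }
    decode_step(rank, -1);
    MPI_Finalize();
    return 0;
}
```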
gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); } +#ifdef GGML_USE_MPI + printf(" --mpi-layer-split N percentiles to split the layers by across nodes\n"); +#endif printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" -gan N, --grp-attn-n N\n"); diff --git a/ggml-mpi.c b/ggml-mpi.c index 1e4d0b376..fd88eab1f 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -47,7 +47,7 @@ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int } void ggml_mpi_free(struct ggml_mpi_context * ctx) { - MPI_Comm_free(ctx->comm); + MPI_Comm_free(&(ctx->comm)); free(ctx); } @@ -55,7 +55,7 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx) { return ctx->rank; } -int ggml_mpi_size(struct ggml_mpi_context * ctx) { +size_t ggml_mpi_size(struct ggml_mpi_context * ctx) { return ctx->size; } @@ -69,30 +69,41 @@ void ggml_mpi_eval_init( MPI_Barrier(ctx_mpi->comm); - + int32_t old_n_tokens = *n_tokens; MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); - if (ctx_mpi->rank != 0) { - *pos = calloc(*n_tokens, sizeof(int32_t)); - *n_seq_ids = calloc(*n_tokens, sizeof(int32_t)); - *logits = calloc(*n_tokens, sizeof(int8_t)); + // If what was passed in differs from what was broadcast, + // we can't guarantee the allocated sizes are correct + // TODO check how often this is done and if it's a problem, + // try to allocate ahead of time + if (old_n_tokens != *n_tokens) { + *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); + *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); + *logits = realloc(*logits, *n_tokens * sizeof(int32_t)); } + + +// MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + + // We need to know the total number of sequence + // ids, so we count them all up int32_t total_n_seq_ids = 0; - for (size_t i = 0; i < *n_tokens; i++) { + for (int32_t i = 0; i < *n_tokens; i++) { total_n_seq_ids += (*n_seq_ids)[i]; } - MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); - + // MPI can't chase the pointers for multidimensional arrays, so we flatten them first + // for transit int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t)); int32_t current_index = 0; + // Only rank 0 needs to flatten since the others don't have the real seq_id if (ctx_mpi->rank == 0) { - for (size_t i = 0; i < *n_tokens; i++) { - for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + for (int32_t i = 0; i < *n_tokens; i++) { + for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { flattened_seq_ids[current_index] = (*seq_id)[i][j]; current_index++; } @@ -100,25 +111,26 @@ void ggml_mpi_eval_init( } - MPI_Bcast(*pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast( *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); current_index = 0; - for (size_t i = 0; i < 
*n_tokens; i++) { + for (int32_t i = 0; i < *n_tokens; i++) { new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t)); - for (size_t j = 0; j < (*n_seq_ids)[i]; j++) { + for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { new_seq_id[i][j] = flattened_seq_ids[current_index]; current_index++; } } free(flattened_seq_ids); + //free(*seq_id); // <- something is still holding onto this, need to investigate *seq_id = new_seq_id; } void ggml_mpi_synch_int( - struct ggml_mpi_context * ctx_mpi, - int32_t * val + struct ggml_mpi_context * ctx_mpi, + int32_t * val ) { MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm); } @@ -284,7 +296,7 @@ void ggml_mpi_graph_compute_pre( { - const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; + //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; diff --git a/ggml-mpi.h b/ggml-mpi.h index f3c4bf2aa..62b15faef 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -1,5 +1,6 @@ #pragma once #include +#include struct ggml_context; struct ggml_tensor; @@ -98,7 +99,7 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx); * @param ctx The context containing the communicator used for this size check. * @return The number of nodes that are a part of the given context's communicator. */ -int ggml_mpi_size(struct ggml_mpi_context * ctx); +size_t ggml_mpi_size(struct ggml_mpi_context * ctx); /** * Synchronize needed information among the nodes diff --git a/llama.cpp b/llama.cpp index a5f56b552..d1f356504 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13094,6 +13094,7 @@ void llama_split_layers_weighted(struct llama_context * ctx, float device_weight } uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); + free(ranges); #endif } From 32078d6fe11bc42501b5f5f018c69b1c4ed7699c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 1 Nov 2023 12:23:30 -0500 Subject: [PATCH 16/35] Fix missing layer_inp_i names --- llama.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llama.cpp b/llama.cpp index d1f356504..ec3707ff2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5729,6 +5729,7 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * inpSA = inpL; // norm @@ -5907,6 +5908,7 @@ struct llm_build_context { struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -6016,6 +6018,7 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, @@ -6134,6 +6137,7 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -6221,6 +6225,7 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * residual = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -6420,6 +6425,7 @@ struct llm_build_context { struct ggml_tensor * 
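For context on `llama_split_layers_weighted` above: the `--mpi-layer-split` weights are turned into one `[start, end]` layer range per rank (as `uint16_t` pairs) before being scattered with `ggml_mpi_scatter_layers`. The arithmetic itself is independent of MPI; below is a small sketch of one plausible way to derive such ranges, with the rounding behaviour assumed rather than copied from `ggml_mpi_split_range`:

```cpp
// layer_split.cpp - illustrative sketch only
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Map normalized weights onto contiguous [start, end] layer ranges.
static std::vector<std::pair<uint16_t, uint16_t>> split_layers(uint16_t n_layers,
                                                               const std::vector<float> & weights) {
    float total = 0.0f;
    for (float w : weights) total += w;

    std::vector<std::pair<uint16_t, uint16_t>> ranges;
    uint16_t next = 0;
    for (size_t i = 0; i < weights.size(); i++) {
        uint16_t count = (uint16_t)((weights[i] / total) * n_layers + 0.5f); // round to nearest
        uint16_t start = next;
        uint16_t end   = (i + 1 == weights.size()) ? (uint16_t)(n_layers - 1) // last rank takes the rest
                                                   : (uint16_t)(start + count - 1);
        ranges.push_back({start, end});
        next = (uint16_t)(end + 1);
    }
    return ranges;
}

int main() {
    // e.g. --mpi-layer-split 0.8,0.2 on a 32-layer model
    for (const auto & r : split_layers(32, {0.8f, 0.2f})) {
        printf("layers [%u, %u]\n", (unsigned) r.first, (unsigned) r.second);
    }
}
```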
KQ_pos = build_inp_KQ_pos(); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * inpSA = inpL; cur = llm_build_norm(ctx0, inpL, hparams, @@ -6699,6 +6705,7 @@ struct llm_build_context { cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, @@ -6786,6 +6793,7 @@ struct llm_build_context { struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); //MPI struct ggml_tensor * attn_norm; attn_norm = llm_build_norm(ctx0, inpL, hparams, From c9d18263b35ae395435b4ca605ad6fb7c0440077 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 1 Nov 2023 14:55:32 -0500 Subject: [PATCH 17/35] Allow per-node threads to be set in command-line args, add mpi support to main --- common/common.cpp | 58 ++++++++++++++++++++++++++++++++---------- common/common.h | 9 +++---- examples/main/main.cpp | 2 ++ llama.cpp | 8 ++++++ llama.h | 3 +++ 5 files changed, 61 insertions(+), 19 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d924c80dc..cae17a3d2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -162,18 +162,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.n_threads.resize(split_arg.size()); + for (size_t i = 0; i < split_arg.size(); ++i) { + params.n_threads[i] = std::stoi(split_arg[i]); + if (params.n_threads[i] <= 0) { + params.n_threads[i] = std::thread::hardware_concurrency(); + } } + } else if (arg == "-tb" || arg == "--threads-batch") { if (++i >= argc) { invalid_param = true; break; } - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.n_threads_batch.resize(split_arg.size()); + for (size_t i = 0; i < split_arg.size(); ++i) { + params.n_threads_batch[i] = std::stoi(split_arg[i]); + if (params.n_threads_batch[i] <= 0) { + params.n_threads_batch[i] = std::thread::hardware_concurrency(); + } } } else if (arg == "-td" || arg == "--threads-draft") { if (++i >= argc) { @@ -976,7 +995,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" (can be specified more than once for multiple prompts).\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads[0]); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" 
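The `-t`/`--threads` handling above uses the same splitting idiom as the other per-node options: tokenize the argument on `,` and `/` with `std::sregex_token_iterator`, convert each piece, and fall back to the hardware concurrency for non-positive values. A self-contained sketch of that idiom with the template arguments spelled out in full:

```cpp
// split_list.cpp - illustrative sketch only
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <thread>
#include <vector>

// Split "4,8/2" into per-node values; non-positive entries fall back to the
// local hardware concurrency, mirroring the fallback used in the patch.
static std::vector<int32_t> parse_per_node_ints(const std::string & arg) {
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg.begin(), arg.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    std::vector<int32_t> values(split_arg.size());
    for (size_t node = 0; node < split_arg.size(); ++node) {
        values[node] = std::stoi(split_arg[node]);
        if (values[node] <= 0) {
            values[node] = (int32_t) std::thread::hardware_concurrency();
        }
    }
    return values;
}

int main() {
    for (int32_t v : parse_per_node_ints("4,8/2")) {
        printf("%d\n", v);
    }
}
```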
-td N, --threads-draft N"); @@ -1135,9 +1154,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { std::string get_system_info(const gpt_params & params) { std::ostringstream os; - os << "system_info: n_threads = " << params.n_threads; - if (params.n_threads_batch != -1) { - os << " (n_threads_batch = " << params.n_threads_batch << ")"; + os << "system_info: n_threads = " << params.n_threads[0]; + if (params.n_threads_batch[0] != -1) { + os << " (n_threads_batch = " << params.n_threads_batch[0] << ")"; } os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); @@ -1318,8 +1337,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_seq_max = params.n_parallel; cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.n_threads = params.n_threads[0]; + cparams.n_threads_batch = params.n_threads_batch[0] == -1 ? params.n_threads[0] : params.n_threads_batch[0]; cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; @@ -1363,6 +1382,7 @@ void llama_batch_add( } std::tuple llama_init_from_gpt_params(gpt_params & params) { + int32_t n_threads = params.n_threads[0]; auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); @@ -1380,6 +1400,16 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } +#ifdef GGML_USE_MPI + int node_id = llama_node_id(lctx); + n_threads = (node_id >= params.n_threads.size()) ? get_num_physical_cores() : params.n_threads[node_id]; + int32_t n_threads_batch = (node_id >= params.n_threads_batch.size()) ? -1 : params.n_threads_batch[node_id]; + + params.n_threads[0] = n_threads; // So we can treat index 0 as what our n_threads is elsewhere + params.n_threads_batch[0] = n_threads_batch; + llama_set_n_threads(lctx, n_threads, (n_threads_batch > 0) ? n_threads_batch : get_num_physical_cores()); +#endif + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -1389,7 +1419,7 @@ std::tuple llama_init_from_gpt_par ((i > 0) || params.lora_base.empty()) ? 
NULL : params.lora_base.c_str(), - params.n_threads); + n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); @@ -1806,7 +1836,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads[0], std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); diff --git a/common/common.h b/common/common.h index f3b913d9d..9c7af1ee5 100644 --- a/common/common.h +++ b/common/common.h @@ -44,11 +44,10 @@ int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - - int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; + std::vector n_threads = {get_num_physical_cores()}; + std::vector n_threads_batch = {-1}; // number of threads to use for batch processing (-1 = use n_threads) + std::vector n_threads_draft = {get_num_physical_cores()}; + std::vector n_threads_batch_draft = {-1}; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index e2d07a631..b4b3f8a6c 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -207,6 +207,8 @@ int main(int argc, char ** argv) { return 1; } + llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); diff --git a/llama.cpp b/llama.cpp index ec3707ff2..2bdb38434 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12668,6 +12668,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { return result; } +int llama_node_id(struct llama_context * ctx) { +#ifdef GGML_USE_MPI + return ggml_mpi_rank(ctx->ctx_mpi); + +#endif + return 0; +} + size_t llama_max_devices(void) { #if defined(GGML_USE_METAL) return 1; diff --git a/llama.h b/llama.h index fe6ba4a47..818056064 100644 --- a/llama.h +++ b/llama.h @@ -372,6 +372,9 @@ extern "C" { LLAMA_API size_t llama_max_devices(void); + // Get the ID of this compute node, usually 0 + // unless running MPI, in which case it is the rank of the node + LLAMA_API int llama_node_id(struct llama_context * ctx); LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_gpu_offload(void); From aa166462f1e97a2c6a130f65ba5c77581967aa5a Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Sat, 3 Feb 2024 13:57:00 -0600 Subject: [PATCH 18/35] Fix draft thread args and remove grads from mpi eval_init --- common/common.cpp | 46 ++++++++++++++++++++++++++++++++-------------- ggml-mpi.c | 5 +++-- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 
cae17a3d2..46ec366b0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -169,10 +169,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; params.n_threads.resize(split_arg.size()); - for (size_t i = 0; i < split_arg.size(); ++i) { - params.n_threads[i] = std::stoi(split_arg[i]); - if (params.n_threads[i] <= 0) { - params.n_threads[i] = std::thread::hardware_concurrency(); + for (size_t node = 0; node < split_arg.size(); ++node) { + params.n_threads[node] = std::stoi(split_arg[node]); + if (params.n_threads[node] <= 0) { + params.n_threads[node] = std::thread::hardware_concurrency(); } } @@ -188,10 +188,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; params.n_threads_batch.resize(split_arg.size()); - for (size_t i = 0; i < split_arg.size(); ++i) { - params.n_threads_batch[i] = std::stoi(split_arg[i]); - if (params.n_threads_batch[i] <= 0) { - params.n_threads_batch[i] = std::thread::hardware_concurrency(); + for (size_t node = 0; node < split_arg.size(); ++node) { + params.n_threads_batch[node] = std::stoi(split_arg[node]); + if (params.n_threads_batch[node] <= 0) { + params.n_threads_batch[node] = std::thread::hardware_concurrency(); } } } else if (arg == "-td" || arg == "--threads-draft") { @@ -199,18 +199,36 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.n_threads_draft.resize(split_arg.size()); + for (size_t node = 0; node < split_arg.size(); ++node) { + params.n_threads_draft[node] = std::stoi(split_arg[node]); + if (params.n_threads_draft[node] <= 0) { + params.n_threads_draft[node] = std::thread::hardware_concurrency(); + } } } else if (arg == "-tbd" || arg == "--threads-batch-draft") { if (++i >= argc) { invalid_param = true; break; } - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.n_threads_batch_draft.resize(split_arg.size()); + for (size_t node = 0; node < split_arg.size(); ++node) { + params.n_threads_batch_draft[node] = std::stoi(split_arg[node]); + if (params.n_threads_batch_draft[node] <= 0) { + params.n_threads_batch_draft[node] = std::thread::hardware_concurrency(); + } } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { diff --git a/ggml-mpi.c b/ggml-mpi.c index fd88eab1f..c10faa252 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -261,7 +261,9 @@ void ggml_mpi_graph_compute_pre( return; } - GGML_ASSERT(inp0 == gf->nodes[0]); +// fprintf(stderr, "gf->nodes[0] == %s\n", ggml_get_name(gf->nodes[0])); +// +// GGML_ASSERT(inp0 == gf->nodes[0]); // distribute the compute graph into slices across the MPI nodes // @@ -333,7 +335,6 @@ void 
ggml_mpi_graph_compute_pre( // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph for (int i = 1; i < idx_l1 - idx_l0; i++) { gf->nodes[i] = gf->nodes[idx_l0 + i]; - gf->grads[i] = gf->grads[idx_l0 + i]; } // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node From b98274c76fa5c1b534237fd5327aae4a7435b0c5 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 5 Feb 2024 17:19:45 -0600 Subject: [PATCH 19/35] Begin transition to backend v2 --- CMakeLists.txt | 2 +- Makefile | 2 +- ggml-mpi.c => ggml-mpi.cpp | 210 +++++++++++++++++++++++++++++++------ ggml-mpi.h | 19 ++++ ggml.h | 1 + llama.cpp | 31 ++++++ 6 files changed, 233 insertions(+), 32 deletions(-) rename ggml-mpi.c => ggml-mpi.cpp (67%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ac2804a6..b805650d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -426,7 +426,7 @@ if (LLAMA_MPI) message(STATUS "MPI found") set(GGML_HEADERS_MPI ggml-mpi.h) - set(GGML_SOURCES_MPI ggml-mpi.c) + set(GGML_SOURCES_MPI ggml-mpi.cpp) add_compile_definitions(GGML_USE_MPI) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) diff --git a/Makefile b/Makefile index cb597b209..f8ad9f146 100644 --- a/Makefile +++ b/Makefile @@ -573,7 +573,7 @@ endif endif # LLAMA_METAL ifdef LLAMA_MPI -ggml-mpi.o: ggml-mpi.c ggml-mpi.h +ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI diff --git a/ggml-mpi.c b/ggml-mpi.cpp similarity index 67% rename from ggml-mpi.c rename to ggml-mpi.cpp index c10faa252..b43dd96d1 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.cpp @@ -1,11 +1,14 @@ #include "ggml-mpi.h" #include "ggml.h" +#include "ggml-backend.h" +#include "ggml-backend-impl.h" #include #include #include +#include #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -17,6 +20,8 @@ struct ggml_mpi_context { MPI_Comm comm; int layer_start; int layer_end; + struct ggml_tensor *inp0; + std::string name; }; void ggml_mpi_backend_init(void) { @@ -29,7 +34,7 @@ void ggml_mpi_backend_free(void) { } struct ggml_mpi_context * ggml_mpi_init(void) { - struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context)); + auto * ctx = static_cast(calloc(1, sizeof(struct ggml_mpi_context))); MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); @@ -39,7 +44,7 @@ struct ggml_mpi_context * ggml_mpi_init(void) { } struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) { - struct ggml_mpi_context * newCtx = calloc(1, sizeof(struct ggml_mpi_context)); + auto * newCtx = static_cast(calloc(1, sizeof(struct ggml_mpi_context))); MPI_Comm_split(ctx->comm, color, key, &newCtx->comm); MPI_Comm_rank(newCtx->comm, &newCtx->rank); MPI_Comm_size(newCtx->comm, &newCtx->size); @@ -70,16 +75,16 @@ void ggml_mpi_eval_init( MPI_Barrier(ctx_mpi->comm); int32_t old_n_tokens = *n_tokens; - MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm); + MPI_Bcast(n_tokens, 1, MPI_INT32_T, 0, ctx_mpi->comm); // If what was passed in differs from what was broadcast, // we can't guarantee the allocated sizes are correct // TODO check how often this is done and if it's a problem, // try to allocate ahead of time if (old_n_tokens != *n_tokens) { - *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); - *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); - *logits = realloc(*logits, *n_tokens * sizeof(int32_t)); + *pos = static_cast(realloc(*pos, *n_tokens * sizeof(int32_t))); + *n_seq_ids = static_cast(realloc(*n_seq_ids, *n_tokens * sizeof(int32_t))); + *logits = static_cast(realloc(*logits, *n_tokens * sizeof(int32_t))); } @@ -96,7 +101,7 @@ void ggml_mpi_eval_init( // MPI can't chase the pointers for multidimensional arrays, so we flatten them first // for transit - int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t)); + auto * flattened_seq_ids = static_cast(calloc(total_n_seq_ids, sizeof(int32_t))); int32_t current_index = 0; @@ -114,10 +119,10 @@ void ggml_mpi_eval_init( MPI_Bcast( *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); - int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*)); + auto ** new_seq_id = static_cast(calloc(*n_tokens, sizeof(int32_t *))); current_index = 0; for (int32_t i = 0; i < *n_tokens; i++) { - new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t)); + new_seq_id[i] = static_cast(calloc((*n_seq_ids)[i], sizeof(int32_t))); for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { new_seq_id[i][j] = flattened_seq_ids[current_index]; current_index++; @@ -176,7 +181,7 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_C } MPI_Status status; UNUSED(status); - + fprintf(stderr, "%s: tensor receive == null: %d\n", __func__, t->data == NULL); const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status); GGML_ASSERT(retval == MPI_SUCCESS); } @@ -241,11 +246,7 @@ void ggml_mpi_scatter_layers( fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end); } -// TODO: there are many improvements that can be done to this implementation -void ggml_mpi_graph_compute_pre( - struct ggml_mpi_context * ctx_mpi, - 
struct ggml_cgraph * gf, - int n_layers) { +void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) { const int mpi_rank = ctx_mpi->rank; const int mpi_size = ctx_mpi->size; @@ -261,6 +262,8 @@ void ggml_mpi_graph_compute_pre( return; } + ctx_mpi->inp0 = inp0; + // fprintf(stderr, "gf->nodes[0] == %s\n", ggml_get_name(gf->nodes[0])); // // GGML_ASSERT(inp0 == gf->nodes[0]); @@ -278,23 +281,11 @@ void ggml_mpi_graph_compute_pre( // - - if (mpi_rank > 0) { - if (mpi_rank == 1) { - // the first node (1) receives the input tokens from the main node (0) - ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); - } else { - // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) - ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); - } - } else if (mpi_size > 1) { - // node 0 sends the input tokens to node 1 - ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); - - // recv the output data from the last node - ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); + for (int i = 0; i < gf->n_nodes; i++) { + gf->nodes[i]->backend = GGML_BACKEND_MPI_SPLIT; } + { @@ -347,6 +338,47 @@ void ggml_mpi_graph_compute_pre( } } +// TODO: there are many improvements that can be done to this implementation +void ggml_mpi_graph_compute_pre( + struct ggml_mpi_context * ctx_mpi, + struct ggml_cgraph * gf, + int n_layers) { + const int mpi_rank = ctx_mpi->rank; + const int mpi_size = ctx_mpi->size; + + struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); + if (inp_tokens == NULL) { + fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); + return; + } + + struct ggml_tensor * inp0 = ctx_mpi->inp0; + if (inp0 == NULL) { + fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); + return; + } + + if (mpi_rank > 0) { + if (mpi_rank == 1) { + // the first node (1) receives the input tokens from the main node (0) + if (inp_tokens->data == NULL) { + + } + ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); + } else { + // recv input data for each node into the "inp0" tensor (i.e. 
the first node in the compute graph) + fprintf(stderr, "%s:%d: receiving layer inp0\n", __func__, ctx_mpi->rank); + ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); + } + } else if (mpi_size > 1) { + // node 0 sends the input tokens to node 1 + ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); + + // recv the output data from the last node + ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); + } +} + void ggml_mpi_graph_compute_post( struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, @@ -361,3 +393,121 @@ void ggml_mpi_graph_compute_post( ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm); } } + +// BACKEND V2 + +static const char * ggml_backend_mpi_name(ggml_backend_t backend) { + auto * ctx = static_cast(backend->context); + return ctx->name.c_str(); +} + +static void ggml_backend_mpi_free(ggml_backend_t backend) { + auto * ctx = static_cast(backend->context); + + delete ctx; + + + delete backend; +} + +static ggml_backend_buffer_type_t ggml_backend_mpi_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); +} + +GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_CPY: + return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float + case GGML_OP_MUL_MAT: + return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + default: + return true; + } + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i mpi_backend_i = { + /* .get_name = */ ggml_backend_mpi_name, + /* .free = */ ggml_backend_mpi_free, + /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_graph_compute, + /* .supports_op = */ ggml_backend_mpi_supports_op, +}; + + +std::vector ggml_mpi_available_devices_internal() { + static bool has_init = false; + if (!has_init) { + ggml_mpi_backend_init(); + has_init = true; + } + std::vector devices; + int s; + MPI_Comm_size(MPI_COMM_WORLD, &s); + devices.resize(s); + for (int i = 0; i < s; i++) { + devices[i] = ggml_mpi_device{ + i, + ggml_mpi_init(), + ("MPI_COMM_WORLD:" + std::to_string(i)).c_str(), + 1 + }; + } + return devices; +} + +ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft) { + auto* ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{ + /* .iface = */ buft->iface, + /* .context = */ buft->context, + }; + + return ggml_backend_wrapped_buffer_type; +} + +ggml_backend_t ggml_backend_mpi_init(int index) { + auto *mpi_backend = new ggml_backend { + /* .interface = */ mpi_backend_i, + /* .context = */ ggml_mpi_init(), + }; + + return mpi_backend; +} + +static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) { + GGML_UNUSED(params); + return ggml_backend_mpi_init(intptr_t(user_data)); +} + + + +ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) { + return ggml_backend_cpu_buffer_type(); +} + +extern "C" GGML_CALL int ggml_backend_mpi_reg_devices(); + +int ggml_backend_mpi_reg_devices() { + auto devices = ggml_mpi_available_devices_internal(); + for (const auto & device : devices) { + 
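Taken together, `ggml_mpi_graph_compute_pre` and `ggml_mpi_graph_compute_post` above form a ring-shaped pipeline: rank 0 feeds the token input to rank 1, every other rank receives its input from the previous rank and forwards its last node to `(rank + 1) % size`, and rank 0 finally receives the result from the last rank. A stripped-down sketch of that data flow, with a small float buffer standing in for the tensors:

```cpp
// ring_pipeline.cpp - illustrative sketch only; run with: mpirun -np 3 ./ring_pipeline
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    std::vector<float> act(4, 0.0f);            // stand-in for the "inp0" tensor

    // pre: receive this rank's input
    if (rank > 0) {
        MPI_Recv(act.data(), (int) act.size(), MPI_FLOAT, rank - 1, MPI_ANY_TAG,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    } else if (size > 1) {
        // rank 0 kicks off the pipeline with the "tokens"...
        act.assign(act.size(), 1.0f);
        MPI_Send(act.data(), (int) act.size(), MPI_FLOAT, 1, 0, MPI_COMM_WORLD);
        // ...and waits for the final activations from the last rank
        MPI_Recv(act.data(), (int) act.size(), MPI_FLOAT, size - 1, MPI_ANY_TAG,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    // "compute": each rank applies its slice of the model
    for (float & x : act) x += 1.0f;

    // post: forward the result to the next rank in the ring
    if (rank > 0) {
        MPI_Send(act.data(), (int) act.size(), MPI_FLOAT, (rank + 1) % size, 0, MPI_COMM_WORLD);
    }

    if (rank == 0) {
        printf("rank 0 received activations processed by %d other ranks\n", size - 1);
    }
    MPI_Finalize();
}
```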
ggml_backend_register( + device.name, + ggml_backend_reg_mpi_init, + ggml_backend_mpi_buffer_type(device.index), + reinterpret_cast(intptr_t(device.index)) + ); + } + return devices.size(); +} + + + + diff --git a/ggml-mpi.h b/ggml-mpi.h index 62b15faef..2a0c5809c 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -1,6 +1,8 @@ #pragma once #include #include +#include "ggml.h" +#include "ggml-backend.h" struct ggml_context; struct ggml_tensor; @@ -49,6 +51,11 @@ void ggml_mpi_backend_free(void); */ struct ggml_mpi_context * ggml_mpi_init(void); +void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers); + +GGML_API ggml_backend_t ggml_backend_mpi_init(int index); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft); + /** * Create a new context by splitting the given context's * communicator, creating a "sub-communicator." This is a collective @@ -194,6 +201,18 @@ void ggml_mpi_graph_compute_post( struct ggml_cgraph * gf, int n_layers); +// BACKEND V2 + +struct ggml_mpi_device { + int index; + struct ggml_mpi_context * ctx_mpi; + const char * name; + int subgroupSize; +}; + +#define MPI_BACKEND_NAME "MPI" +GGML_CALL int ggml_backend_mpi_reg_devices(); + #ifdef __cplusplus } #endif diff --git a/ggml.h b/ggml.h index ab26c8f59..a4efe792d 100644 --- a/ggml.h +++ b/ggml.h @@ -379,6 +379,7 @@ extern "C" { GGML_BACKEND_TYPE_CPU = 0, GGML_BACKEND_TYPE_GPU = 10, GGML_BACKEND_TYPE_GPU_SPLIT = 20, + GGML_BACKEND_TYPE_MPI_SPLIT = 30, }; // model file types diff --git a/llama.cpp b/llama.cpp index 2bdb38434..444c99e58 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4089,6 +4089,18 @@ static bool llm_load_tensors( } } +#ifdef GGML_USE_MPI + for (int64_t i = 0; i < n_layer; i++) { + model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix), + ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)}; + } + + model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix), + ggml_backend_mpi_wrap_buffer(model.buft_input.buft)}; + model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix), + ggml_backend_mpi_wrap_buffer(model.buft_output.buft)}; +#endif + // count used buffer types std::map buft_layer_count; buft_layer_count[model.buft_input.buft]++; @@ -4965,6 +4977,12 @@ static bool llm_load_tensors( mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } } + +#ifdef GGML_USE_MPI + if (buf == nullptr) { + continue; + } +#endif if (buf == nullptr) { throw std::runtime_error("failed to allocate buffer"); } @@ -12978,6 +12996,19 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#endif + +#ifdef GGML_USE_MPI + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + + #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { From 968cefb4a9c430c09b2b7a4df9dbc24b74efe593 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 19 Feb 2024 12:21:48 -0600 Subject: [PATCH 20/35] Wrap backends with MPI backend --- ggml-mpi.cpp | 124 +++++++++++++++++++-------------------------------- ggml-mpi.h | 9 ++-- llama.cpp | 50 +++++++++------------ llama.h | 3 ++ 4 files changed, 74 insertions(+), 
112 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index b43dd96d1..3d2fc829e 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -22,6 +22,7 @@ struct ggml_mpi_context { int layer_end; struct ggml_tensor *inp0; std::string name; + struct ggml_backend * wrapped_backend; }; void ggml_mpi_backend_init(void) { @@ -247,8 +248,6 @@ void ggml_mpi_scatter_layers( } void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) { - const int mpi_rank = ctx_mpi->rank; - const int mpi_size = ctx_mpi->size; struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); if (inp_tokens == NULL) { @@ -286,73 +285,22 @@ void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml } - { - - - //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size; - - const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1; - - //const int il0 = (mpi_idx + 0) * n_per_node; - //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node); - int il0 = ctx_mpi->layer_start; - int il1 = MIN(n_layers, ctx_mpi->layer_end); - - char name_l0[GGML_MAX_NAME]; - char name_l1[GGML_MAX_NAME]; - - snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0); - snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1); - - const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0); - const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes; - - if (idx_l0 < 0 || idx_l1 < 0) { - fprintf(stderr, "%s: layer input nodes not found\n", __func__); - return; - } - - // attach the input data to all nodes that need it - // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below) - for (int i = idx_l0; i < idx_l1; i++) { - if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) { - gf->nodes[i]->src[0] = inp0; - } - if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) { - gf->nodes[i]->src[1] = inp0; - } - } - - // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph - for (int i = 1; i < idx_l1 - idx_l0; i++) { - gf->nodes[i] = gf->nodes[idx_l0 + i]; - } - - // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node - if (mpi_idx != 0) { - gf->nodes[0]->op = GGML_OP_NONE; - } - - gf->n_nodes = idx_l1 - idx_l0; - - } } // TODO: there are many improvements that can be done to this implementation void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf, - int n_layers) { + struct ggml_cgraph * gf) { const int mpi_rank = ctx_mpi->rank; const int mpi_size = ctx_mpi->size; - struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); + struct ggml_tensor * inp_tokens = gf->nodes[0]; if (inp_tokens == NULL) { fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); return; } - struct ggml_tensor * inp0 = ctx_mpi->inp0; + struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0"); if (inp0 == NULL) { fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); return; @@ -381,9 +329,7 @@ void ggml_mpi_graph_compute_pre( void ggml_mpi_graph_compute_post( struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf, - int n_layers) { - UNUSED(n_layers); + struct ggml_cgraph * gf) { const int mpi_rank = ctx_mpi->rank; const int mpi_size = ctx_mpi->size; @@ -396,9 +342,24 @@ void ggml_mpi_graph_compute_post( // BACKEND V2 +GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + + struct ggml_mpi_context * 
ctx = (ggml_mpi_context *) backend->context; + + ggml_mpi_graph_compute_pre(ctx, cgraph); + + ggml_backend_t wrapped_backend = ctx->wrapped_backend; + bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph); + + ggml_mpi_graph_compute_post(ctx, cgraph); + + return ret; +} + + static const char * ggml_backend_mpi_name(ggml_backend_t backend) { auto * ctx = static_cast(backend->context); - return ctx->name.c_str(); + return ctx->wrapped_backend->iface.get_name(backend); } static void ggml_backend_mpi_free(ggml_backend_t backend) { @@ -427,20 +388,6 @@ GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const GGML_UNUSED(backend); } -static struct ggml_backend_i mpi_backend_i = { - /* .get_name = */ ggml_backend_mpi_name, - /* .free = */ ggml_backend_mpi_free, - /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_graph_compute, - /* .supports_op = */ ggml_backend_mpi_supports_op, -}; std::vector ggml_mpi_available_devices_internal() { @@ -473,23 +420,42 @@ ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type return ggml_backend_wrapped_buffer_type; } -ggml_backend_t ggml_backend_mpi_init(int index) { +ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend) { + + struct ggml_backend_i mpi_backend_i = { + /* .get_name = */ wrapped_backend->iface.get_name, + /* .free = */ ggml_backend_mpi_free, + /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_mpi_graph_compute, + /* .supports_op = */ ggml_backend_mpi_supports_op, + }; + + ggml_mpi_context * ctx = ggml_mpi_init(); + ctx->wrapped_backend = wrapped_backend; auto *mpi_backend = new ggml_backend { /* .interface = */ mpi_backend_i, - /* .context = */ ggml_mpi_init(), + /* .context = */ ctx, }; return mpi_backend; } static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) { + // TODO check what the parameters are for. Could use it to setup the MPI comms and routes? 
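The backend introduced above is a plain decorator: it stores the backend it wraps, and its `graph_compute` runs the MPI pre-hook, delegates to the wrapped backend, then runs the post-hook. A ggml-free sketch of the same structure; the interface and hook names here are invented for illustration only:

```cpp
// backend_decorator.cpp - illustrative sketch only
#include <cstdio>
#include <memory>
#include <utility>

// Minimal stand-in for a compute backend interface.
struct Backend {
    virtual ~Backend() = default;
    virtual const char * name() const = 0;
    virtual bool graph_compute() = 0;
};

struct CpuBackend : Backend {
    const char * name() const override { return "CPU"; }
    bool graph_compute() override { printf("  computing on %s\n", name()); return true; }
};

// Decorator: same interface, adds communication hooks around the wrapped call.
struct MpiBackend : Backend {
    explicit MpiBackend(std::unique_ptr<Backend> wrapped) : wrapped(std::move(wrapped)) {}

    const char * name() const override { return "MPI"; }

    bool graph_compute() override {
        pre();                                   // e.g. receive this rank's inputs
        bool ok = wrapped->graph_compute();      // delegate the actual work
        post();                                  // e.g. send outputs to the next rank
        return ok;
    }

private:
    void pre()  { printf("MPI pre-hook (receive inputs)\n"); }
    void post() { printf("MPI post-hook (send outputs)\n"); }
    std::unique_ptr<Backend> wrapped;
};

int main() {
    // Every existing backend gets replaced by an MPI wrapper around it.
    std::unique_ptr<Backend> backend = std::make_unique<CpuBackend>();
    backend = std::make_unique<MpiBackend>(std::move(backend));
    backend->graph_compute();
}
```

Wrapping rather than replacing keeps the existing CPU (or GPU) backend responsible for the actual computation, which is what the `backend = ggml_backend_mpi_init(backend)` loop added to `llama_new_context_with_model` later in this patch relies on.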
GGML_UNUSED(params); - return ggml_backend_mpi_init(intptr_t(user_data)); + return ggml_backend_mpi_init(ggml_backend_cpu_init()); } -ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) { +ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type() { return ggml_backend_cpu_buffer_type(); } @@ -501,7 +467,7 @@ int ggml_backend_mpi_reg_devices() { ggml_backend_register( device.name, ggml_backend_reg_mpi_init, - ggml_backend_mpi_buffer_type(device.index), + ggml_backend_mpi_buffer_type(), reinterpret_cast(intptr_t(device.index)) ); } diff --git a/ggml-mpi.h b/ggml-mpi.h index 2a0c5809c..c72ec0444 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -53,7 +53,6 @@ struct ggml_mpi_context * ggml_mpi_init(void); void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers); -GGML_API ggml_backend_t ggml_backend_mpi_init(int index); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft); /** @@ -185,8 +184,7 @@ void ggml_mpi_scatter_layers( */ void ggml_mpi_graph_compute_pre( struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf, - int n_layers); + struct ggml_cgraph * gf); /** * Sends the output tensor to the next node for processing @@ -198,8 +196,7 @@ void ggml_mpi_graph_compute_pre( */ void ggml_mpi_graph_compute_post( struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf, - int n_layers); + struct ggml_cgraph * gf); // BACKEND V2 @@ -213,6 +210,8 @@ struct ggml_mpi_device { #define MPI_BACKEND_NAME "MPI" GGML_CALL int ggml_backend_mpi_reg_devices(); +GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend); + #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index 444c99e58..edf2a03cf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4090,15 +4090,15 @@ static bool llm_load_tensors( } #ifdef GGML_USE_MPI - for (int64_t i = 0; i < n_layer; i++) { - model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix), - ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)}; - } - - model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix), - ggml_backend_mpi_wrap_buffer(model.buft_input.buft)}; - model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix), - ggml_backend_mpi_wrap_buffer(model.buft_output.buft)}; +// for (int64_t i = 0; i < n_layer; i++) { +// model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix), +// ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)}; +// } +// +// model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix), +// ggml_backend_mpi_wrap_buffer(model.buft_input.buft)}; +// model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix), +// ggml_backend_mpi_wrap_buffer(model.buft_output.buft)}; #endif // count used buffer types @@ -8764,10 +8764,7 @@ static void llama_graph_compute( llama_context & lctx, ggml_cgraph * gf, int n_threads) { -#ifdef GGML_USE_MPI - const int64_t n_layer = lctx.model.hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif + #ifdef GGML_USE_METAL if (ggml_backend_is_metal(lctx.backend_metal)) { @@ -8783,10 +8780,7 @@ static void llama_graph_compute( ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - -#ifdef GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif + } // decode a batch of tokens by evaluating the transformer @@ 
-12619,6 +12613,7 @@ static int llama_apply_lora_from_file_internal( // struct llama_model_params llama_model_default_params() { struct llama_model_params result = { + static_cast(calloc(1, sizeof(int32_t))), /*.n_gpu_layers =*/ 0, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, @@ -12998,18 +12993,7 @@ struct llama_context * llama_new_context_with_model( } #endif -#ifdef GGML_USE_MPI - // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used - ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - -#endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); @@ -13018,6 +13002,16 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); +#ifdef GGML_USE_MPI + + for(auto & backend : ctx->backends) { + backend = ggml_backend_mpi_init(backend); + + } + + ctx->backend_cpu = ctx->backends.back(); +#endif + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); diff --git a/llama.h b/llama.h index 818056064..2f2e775ca 100644 --- a/llama.h +++ b/llama.h @@ -202,6 +202,9 @@ extern "C" { }; struct llama_model_params { + // Array of layers to allocate to each node + int32_t* n_node_layers; + int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs From bc935450057bf6d0e5a2d0bded92ae65d6ac368e Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 26 Feb 2024 19:12:16 -0600 Subject: [PATCH 21/35] Allow MPI backend to wrap multiple backends --- ggml-mpi.cpp | 208 +++++++++++++++++++++++++++++++++++++++++++++------ ggml-mpi.h | 2 +- llama.cpp | 23 +++--- 3 files changed, 202 insertions(+), 31 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 3d2fc829e..8fc1f8d14 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -23,6 +23,8 @@ struct ggml_mpi_context { struct ggml_tensor *inp0; std::string name; struct ggml_backend * wrapped_backend; + std::vector backends; + ggml_backend_sched_t scheduler; }; void ggml_mpi_backend_init(void) { @@ -35,7 +37,7 @@ void ggml_mpi_backend_free(void) { } struct ggml_mpi_context * ggml_mpi_init(void) { - auto * ctx = static_cast(calloc(1, sizeof(struct ggml_mpi_context))); + auto * ctx = new ggml_mpi_context; MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); @@ -300,7 +302,7 @@ void ggml_mpi_graph_compute_pre( return; } - struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0"); + struct ggml_tensor * inp0 = gf->nodes[0]; if (inp0 == NULL) { fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); return; @@ -342,24 +344,108 @@ void ggml_mpi_graph_compute_post( // BACKEND V2 +struct ggml_backend_mpi_buffer_type_context { + std::string name; + ggml_backend_buffer_type_t wrapped_buffer; +}; + +GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft); + GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context; ggml_mpi_graph_compute_pre(ctx, cgraph); - 
ggml_backend_t wrapped_backend = ctx->wrapped_backend; - bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph); + std::vector backend_buft; + for (auto *curr_backend : ctx->backends) { + if (ggml_backend_is_cpu(curr_backend)) { + // use host buffers for the CPU backend compute buffer + backend_buft.push_back(ggml_backend_cpu_buffer_type()); + } else { + backend_buft.push_back(ggml_backend_get_default_buffer_type(curr_backend)); + } + } + +// ggml_backend_t wrapped_backend = ctx->wrapped_backend; +// bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph); + printf("Running MPI backend\n"); + + std::vector> > old_buffs(cgraph->n_nodes); + std::vector old_view_buffs(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + old_buffs.push_back({cgraph->nodes[i]->buffer->buft,{}}); + if (cgraph->nodes[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + cgraph->nodes[i]->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) cgraph->nodes[i]->buffer->buft->context)->wrapped_buffer; + printf("Unwrapped buffer: %s\n", cgraph->nodes[i]->buffer->buft->iface.get_name(cgraph->nodes[i]->buffer->buft)); + } + + for (auto & src : cgraph->nodes[i]->src) { + if (src == nullptr) { + break; + } + old_buffs[i].second.push_back(src->buffer->buft); + if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + src->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) src->buffer->buft->context)->wrapped_buffer; + printf("Unwrapped buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); + } + } + + auto *src = cgraph->nodes[i]->view_src; + if(src != nullptr && src->buffer->buft != nullptr){ + old_view_buffs[i] = src->buffer->buft; + if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + src->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) src->buffer->buft->context)->wrapped_buffer; + printf("Unwrapped view buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); + } + } + } + + + std::vector old_buffs_leaves; + for (int i = 0; i < cgraph->n_leafs; i++) { + old_buffs_leaves.push_back(cgraph->leafs[i]->buffer->buft); + if (cgraph->leafs[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + cgraph->leafs[i]->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) cgraph->leafs[i]->buffer->buft->context)->wrapped_buffer; + printf("Unwrapped buffer: %s\n", cgraph->leafs[i]->buffer->buft->iface.get_name(cgraph->leafs[i]->buffer->buft)); + } + } + + ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), cgraph->n_nodes); + + + printf("Created new scheduler\n"); + ggml_backend_sched_init_measure(sched, cgraph); + printf("Beginning sched graph compute\n"); + ggml_backend_sched_graph_compute(sched, cgraph); + + for (int i = 0; i < cgraph->n_nodes; i++) { + cgraph->nodes[i]->buffer->buft = old_buffs[i].first; + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (cgraph->nodes[i]->src[j] == nullptr) { + break; + } + cgraph->nodes[i]->src[j]->buffer->buft = old_buffs[i].second[j]; + } + if(cgraph->nodes[i]->view_src != nullptr && cgraph->nodes[i]->view_src->buffer->buft != nullptr) { + cgraph->nodes[i]->view_src->buffer->buft = old_view_buffs[i]; + } + + } + + for (int i = 0; i < cgraph->n_leafs; i++) { + cgraph->leafs[i]->buffer->buft = old_buffs_leaves[i]; + } + ggml_mpi_graph_compute_post(ctx, cgraph); - return ret; + return true; } static const char * ggml_backend_mpi_name(ggml_backend_t backend) { - 
auto * ctx = static_cast(backend->context); - return ctx->wrapped_backend->iface.get_name(backend); + return "MPI"; } static void ggml_backend_mpi_free(ggml_backend_t backend) { @@ -372,7 +458,9 @@ static void ggml_backend_mpi_free(ggml_backend_t backend) { } static ggml_backend_buffer_type_t ggml_backend_mpi_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_cpu_buffer_type(); + auto * ctx = static_cast(backend->context); + + return ggml_backend_mpi_wrap_buffer(ctx->backends.back()->iface.get_default_buffer_type(ctx->backends.back())); } GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -411,19 +499,98 @@ std::vector ggml_mpi_available_devices_internal() { return devices; } -ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft) { - auto* ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{ - /* .iface = */ buft->iface, - /* .context = */ buft->context, + + +GGML_CALL bool ggml_backend_is_mpi(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_mpi_name; +} + + +GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + + return strdup(((ctx->name + ":") + std::string(ctx->wrapped_buffer->iface.get_name(ctx->wrapped_buffer))).c_str()); +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_mpi_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + ggml_backend_buffer_t buf = ctx->wrapped_buffer->iface.alloc_buffer(ctx->wrapped_buffer, size); + buf->buft = ggml_backend_mpi_wrap_buffer(buf->buft); + return buf; +} + +GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + return ctx->wrapped_buffer->iface.get_alignment(ctx->wrapped_buffer); +} + +GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + return ctx->wrapped_buffer->iface.get_max_size(ctx->wrapped_buffer); +} + +GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + return ctx->wrapped_buffer->iface.get_alloc_size(ctx->wrapped_buffer, tensor); +} + +GGML_CALL static bool ggml_backend_mpi_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return backend != nullptr && ggml_backend_is_mpi(backend); +} + +GGML_CALL static bool ggml_backend_mpi_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + + return ctx->wrapped_buffer->iface.is_host(ctx->wrapped_buffer); +} + + +static std::map cached_wrappers; + +static std::map cached_backends; + + +GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft) { + + if (cached_wrappers.find(buft) != cached_wrappers.end()) { + return cached_wrappers[buft]; + } + + ggml_backend_buffer_type_i ggml_backend_mpi_buffer_type_interface = { + /* .get_name = */ ggml_backend_mpi_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_mpi_buffer_type_alloc_buffer, + /* .get_alignment = */ 
ggml_backend_mpi_buffer_type_get_alignment, + /* .get_max_size = */ (buft->iface.get_max_size != nullptr ) ? ggml_backend_mpi_buffer_type_get_max_size : nullptr, + /* .get_alloc_size = */ (buft->iface.get_alloc_size != nullptr ) ? ggml_backend_mpi_buffer_type_get_alloc_size : nullptr, + /* .supports_backend = */ ggml_backend_mpi_buffer_type_supports_backend, + /* .is_host = */ (buft->iface.is_host != nullptr ) ? ggml_backend_mpi_buffer_type_is_host : nullptr, }; + auto* ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_mpi_buffer_type_interface, + /* .context = */ new ggml_backend_mpi_buffer_type_context{"MPI",buft}, + }; + + cached_wrappers[buft] = ggml_backend_wrapped_buffer_type; + return ggml_backend_wrapped_buffer_type; } -ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend) { +ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t num_backends) { + + if (cached_backends.find(wrapped_backends) != cached_backends.end()) { + return cached_backends[wrapped_backends]; + } + + ggml_mpi_context * ctx = ggml_mpi_init(); + std::vector wrapped_backends_v; + for (size_t i = 0; i < num_backends; i++) { + wrapped_backends_v.push_back(wrapped_backends[i]); + } + ctx->backends = wrapped_backends_v; struct ggml_backend_i mpi_backend_i = { - /* .get_name = */ wrapped_backend->iface.get_name, + /* .get_name = */ ggml_backend_mpi_name, /* .free = */ ggml_backend_mpi_free, /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type, /* .set_tensor_async = */ NULL, @@ -437,27 +604,26 @@ ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend) { /* .supports_op = */ ggml_backend_mpi_supports_op, }; - ggml_mpi_context * ctx = ggml_mpi_init(); - ctx->wrapped_backend = wrapped_backend; auto *mpi_backend = new ggml_backend { /* .interface = */ mpi_backend_i, /* .context = */ ctx, }; + cached_backends[wrapped_backends] = mpi_backend; + return mpi_backend; } static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) { // TODO check what the parameters are for. Could use it to setup the MPI comms and routes? 
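
A minimal usage sketch of the wrapping API as it stands in this commit. It assumes the `ggml_backend_mpi_init(ggml_backend_t *, size_t)` and `ggml_backend_mpi_wrap_buffer` signatures shown above; later commits in this series change both, and the snippet is illustrative only, not part of the diff:

```cpp
// Sketch only: wrap a single CPU backend in the MPI backend.
#include "ggml-backend.h"
#include "ggml-mpi.h"

static ggml_backend_t make_mpi_backend(void) {
    ggml_backend_t cpu       = ggml_backend_cpu_init();
    ggml_backend_t wrapped[] = { cpu };
    // The MPI backend keeps the wrapped backends in its context and later
    // drives them through a ggml_backend_sched inside graph_compute.
    return ggml_backend_mpi_init(wrapped, 1);
}
```
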
GGML_UNUSED(params); - return ggml_backend_mpi_init(ggml_backend_cpu_init()); + auto * v = new std::vector(); + v->push_back(ggml_backend_cpu_init()); + return ggml_backend_mpi_init(v->data(), 1); } -ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type() { - return ggml_backend_cpu_buffer_type(); -} extern "C" GGML_CALL int ggml_backend_mpi_reg_devices(); @@ -467,7 +633,7 @@ int ggml_backend_mpi_reg_devices() { ggml_backend_register( device.name, ggml_backend_reg_mpi_init, - ggml_backend_mpi_buffer_type(), + ggml_backend_mpi_wrap_buffer(ggml_backend_cpu_buffer_type()), reinterpret_cast(intptr_t(device.index)) ); } diff --git a/ggml-mpi.h b/ggml-mpi.h index c72ec0444..cbe5a51c0 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -210,7 +210,7 @@ struct ggml_mpi_device { #define MPI_BACKEND_NAME "MPI" GGML_CALL int ggml_backend_mpi_reg_devices(); -GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend); +GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t num_backends); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index edf2a03cf..f2f052bbf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1472,6 +1472,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer if (buft == nullptr) { buft = ggml_backend_cpu_buffer_type(); } + +#if defined(GGML_USE_MPI) + buft = ggml_backend_mpi_wrap_buffer(buft); +#endif return buft; GGML_UNUSED(host_buffer); @@ -1523,6 +1527,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g if (buft == nullptr) { buft = llama_default_buffer_type_offload(fallback_gpu); } + +#if defined(GGML_USE_MPI) + buft = ggml_backend_mpi_wrap_buffer(buft); +#endif + return buft; GGML_UNUSED(tensor_split); @@ -4978,11 +4987,6 @@ static bool llm_load_tensors( } } -#ifdef GGML_USE_MPI - if (buf == nullptr) { - continue; - } -#endif if (buf == nullptr) { throw std::runtime_error("failed to allocate buffer"); } @@ -13004,12 +13008,13 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_MPI - for(auto & backend : ctx->backends) { - backend = ggml_backend_mpi_init(backend); + ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size())}; - } - ctx->backend_cpu = ctx->backends.back(); + +// ctx->backend_cpu = ctx->backends.back(); + ctx->backends.push_back(ctx->backend_cpu); + #endif if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) { From 942ce843f8407f32228d2dbf3507357230b36a22 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 12 Mar 2024 11:32:31 -0500 Subject: [PATCH 22/35] Working MPI backend implementation --- ggml-mpi.cpp | 517 +++++++++++++++++++++++++++++++++++++++++++-------- ggml-mpi.h | 11 +- 2 files changed, 444 insertions(+), 84 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 8fc1f8d14..a16cc48b7 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -25,6 +25,7 @@ struct ggml_mpi_context { struct ggml_backend * wrapped_backend; std::vector backends; ggml_backend_sched_t scheduler; + bool remote; }; void ggml_mpi_backend_init(void) { @@ -42,6 +43,7 @@ struct ggml_mpi_context * ggml_mpi_init(void) { MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); MPI_Comm_size(MPI_COMM_WORLD, &ctx->size); ctx->comm = MPI_COMM_WORLD; + ctx->remote = false; return ctx; } @@ -76,10 +78,13 @@ void ggml_mpi_eval_init( int8_t ** logits) { +// fprintf(stderr, "Beginning eval init on rank %d\n", ctx_mpi->rank); MPI_Barrier(ctx_mpi->comm); int32_t old_n_tokens = *n_tokens; MPI_Bcast(n_tokens, 
1, MPI_INT32_T, 0, ctx_mpi->comm); +// fprintf(stderr, "Node %d, old_n_tokens: %d, new n_tokens: %d\n", ctx_mpi->rank, old_n_tokens, *n_tokens); + // If what was passed in differs from what was broadcast, // we can't guarantee the allocated sizes are correct // TODO check how often this is done and if it's a problem, @@ -160,21 +165,30 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { return -1; } - -static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst, MPI_Comm comm) { +static void ggml_mpi_tensor_send(const struct ggml_tensor * t, const void* data, int mpi_rank_dst, MPI_Comm comm) { MPI_Datatype mpi_type; +// fprintf(stderr, "Type: %d\n", t->type); + switch (t->type) { case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break; case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break; + case GGML_TYPE_F16: mpi_type = MPI_INT16_T; break; default: GGML_ASSERT(false && "not implemented"); } + int rank; + MPI_Comm_rank(comm, &rank); +// fprintf(stderr, "Sending tensor %s (buffer %s) from %d to %d\n", t->name, ggml_backend_buffer_name(t->buffer), rank, mpi_rank_dst); - const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, comm); + const int retval = MPI_Send(data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, comm); GGML_ASSERT(retval == MPI_SUCCESS); + +} +static void ggml_mpi_tensor_send(const struct ggml_tensor * t, int mpi_rank_dst, MPI_Comm comm) { + ggml_mpi_tensor_send(t, t->data, mpi_rank_dst, comm); } -static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_Comm comm) { +static void ggml_mpi_tensor_recv(const struct ggml_tensor * t, void * data, int mpi_rank_src, MPI_Comm comm) { MPI_Datatype mpi_type; switch (t->type) { @@ -184,11 +198,18 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_C } MPI_Status status; UNUSED(status); - fprintf(stderr, "%s: tensor receive == null: %d\n", __func__, t->data == NULL); - const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status); +// fprintf(stderr, "%s: tensor receive == null: %d\n", __func__, t->data == NULL); + int rank; + MPI_Comm_rank(comm, &rank); +// fprintf(stderr, "Receiving tensor %s (buffer %s) from %d at %d\n", t->name, ggml_backend_buffer_name(t->buffer), mpi_rank_src, rank); + const int retval = MPI_Recv(data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status); GGML_ASSERT(retval == MPI_SUCCESS); } +static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_Comm comm) { + ggml_mpi_tensor_recv(t, t->data, mpi_rank_src, comm); +} + uint16_t** ggml_mpi_split_range( struct ggml_mpi_context * ctx_mpi, uint16_t start, @@ -296,12 +317,6 @@ void ggml_mpi_graph_compute_pre( const int mpi_rank = ctx_mpi->rank; const int mpi_size = ctx_mpi->size; - struct ggml_tensor * inp_tokens = gf->nodes[0]; - if (inp_tokens == NULL) { - fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); - return; - } - struct ggml_tensor * inp0 = gf->nodes[0]; if (inp0 == NULL) { fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); @@ -309,23 +324,24 @@ void ggml_mpi_graph_compute_pre( } if (mpi_rank > 0) { - if (mpi_rank == 1) { - // the first node (1) receives the input tokens from the main node (0) - if (inp_tokens->data == NULL) { - - } - ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); - } else { - // recv input data for each node into the "inp0" tensor (i.e. 
the first node in the compute graph) - fprintf(stderr, "%s:%d: receiving layer inp0\n", __func__, ctx_mpi->rank); - ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); - } +// ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); +// if (mpi_rank == 1) { +// // the first node (1) receives the input tokens from the main node (0) +// if (inp_tokens->data == NULL) { +// +// } +// ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); +// } else { +// // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) +// fprintf(stderr, "%s:%d: receiving layer inp0\n", __func__, ctx_mpi->rank); +// ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); +// } } else if (mpi_size > 1) { // node 0 sends the input tokens to node 1 - ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); +// ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); // recv the output data from the last node - ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); +// ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); } } @@ -338,27 +354,48 @@ void ggml_mpi_graph_compute_post( // send the output data to the next node if (mpi_rank > 0) { - ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm); +// ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm); } } // BACKEND V2 +struct ggml_backend_mpi_buffer_context { + ggml_backend_buffer_t wrapped_buffer; + int rank; +}; + struct ggml_backend_mpi_buffer_type_context { std::string name; ggml_backend_buffer_type_t wrapped_buffer; + int rank; }; +GGML_CALL static const char * ggml_backend_mpi_buffer_name(ggml_backend_buffer_t buffer) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + return strdup((("MPI Buffer(Rank " + std::to_string(ctx->rank) + ", local rank " + std::to_string(rank) + "):") + std::string(ctx->wrapped_buffer->iface.get_name(ctx->wrapped_buffer))).c_str()); +} + GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft); GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context; - ggml_mpi_graph_compute_pre(ctx, cgraph); + +// if (ctx->remote) { +// return true; +// } + + +// ggml_mpi_graph_compute_pre(ctx, cgraph); std::vector backend_buft; - for (auto *curr_backend : ctx->backends) { + for (auto *curr_backend: ctx->backends) { if (ggml_backend_is_cpu(curr_backend)) { // use host buffers for the CPU backend compute buffer backend_buft.push_back(ggml_backend_cpu_buffer_type()); @@ -369,83 +406,202 @@ GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggm // ggml_backend_t wrapped_backend = ctx->wrapped_backend; // bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph); - printf("Running MPI backend\n"); +// printf("Running MPI backend\n"); - std::vector> > old_buffs(cgraph->n_nodes); + std::vector>> old_buffs( + cgraph->n_nodes); std::vector old_view_buffs(cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - old_buffs.push_back({cgraph->nodes[i]->buffer->buft,{}}); - if (cgraph->nodes[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - cgraph->nodes[i]->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) cgraph->nodes[i]->buffer->buft->context)->wrapped_buffer; - printf("Unwrapped buffer: %s\n", 
cgraph->nodes[i]->buffer->buft->iface.get_name(cgraph->nodes[i]->buffer->buft)); - } - for (auto & src : cgraph->nodes[i]->src) { + + for (int i = 0; i < cgraph->n_nodes; i++) { + old_buffs[i].first = cgraph->nodes[i]->buffer; +// if (cgraph->nodes[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { +// cgraph->nodes[i]->buffer = ((ggml_backend_mpi_buffer_context *) cgraph->nodes[i]->buffer->context)->wrapped_buffer; +//// printf("Unwrapped buffer: %s\n", cgraph->nodes[i]->buffer->buft->iface.get_name(cgraph->nodes[i]->buffer->buft)); +// } + + for (auto &src: cgraph->nodes[i]->src) { if (src == nullptr) { break; } old_buffs[i].second.push_back(src->buffer->buft); +// if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { +// src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; +//// printf("Unwrapped buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); +// } + } + + auto *src = cgraph->nodes[i]->view_src; + if (src != nullptr) { +// fprintf(stderr, "View src is not null, src=%s, src buffer=%s\n", src->name, ggml_backend_buffer_name(src->buffer)); + if (src->buffer->buft != nullptr) { +// fprintf(stderr, "View src buffer type is not null, buft=%s\n", ggml_backend_buft_name(src->buffer->buft)); + old_view_buffs[i] = src->buffer->buft; +// if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { +// src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; +// printf("Unwrapped view buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); +// } + } else { +// old_view_buffs[i] = ggml_backend_mpi_wrap_buffer_type(ggml_backend_cpu_buffer_type()); +// ggml_backend_mpi_buffer_type_set_rank(old_view_buffs[i], ((ggml_backend_mpi_buffer_context*)src->buffer->context)->rank); + } + } else { +// fprintf(stderr, "OLD VIEW BUFF IS NULL\n"); + } + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + auto usage = cgraph->nodes[i]->buffer->usage; + cgraph->nodes[i]->buffer = ((ggml_backend_mpi_buffer_context *) cgraph->nodes[i]->buffer->context)->wrapped_buffer; + cgraph->nodes[i]->buffer->usage = usage; +// printf("Unwrapped buffer: %s\n", cgraph->nodes[i]->buffer->buft->iface.get_name(cgraph->nodes[i]->buffer->buft)); + } + + for (auto &src: cgraph->nodes[i]->src) { + if (src == nullptr) { + break; + } +// old_buffs[i].second.push_back(src->buffer->buft); if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - src->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) src->buffer->buft->context)->wrapped_buffer; - printf("Unwrapped buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); + auto usage = src->buffer->usage; + src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; + src->buffer->usage = usage; +// printf("Unwrapped buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); } } auto *src = cgraph->nodes[i]->view_src; - if(src != nullptr && src->buffer->buft != nullptr){ - old_view_buffs[i] = src->buffer->buft; - if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - src->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) src->buffer->buft->context)->wrapped_buffer; - printf("Unwrapped view buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); + if (src != nullptr) { +// fprintf(stderr, 
"View src is not null, src=%s, src buffer=%s\n", src->name, ggml_backend_buffer_name(src->buffer)); + if (src->buffer->buft != nullptr) { +// fprintf(stderr, "View src buffer type is not null, buft=%s\n", ggml_backend_buft_name(src->buffer->buft)); +// old_view_buffs[i] = src->buffer->buft; + if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + auto usage = src->buffer->usage; + + src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; + src->buffer->usage = usage; +// printf("Unwrapped view buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); + } + } else { +// old_view_buffs[i] = ggml_backend_mpi_wrap_buffer_type(ggml_backend_cpu_buffer_type()); +// ggml_backend_mpi_buffer_type_set_rank(old_view_buffs[i], ((ggml_backend_mpi_buffer_context*)src->buffer->context)->rank); } + } else { +// fprintf(stderr, "OLD VIEW BUFF IS NULL\n"); } } +// fprintf(stderr, "Original n_leafs: %d\n", cgraph->n_leafs); - std::vector old_buffs_leaves; + std::vector old_buffs_leaves; for (int i = 0; i < cgraph->n_leafs; i++) { +// fprintf(stderr, "Pushing leaf %s\n", cgraph->leafs[i]->name); old_buffs_leaves.push_back(cgraph->leafs[i]->buffer->buft); if (cgraph->leafs[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { cgraph->leafs[i]->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) cgraph->leafs[i]->buffer->buft->context)->wrapped_buffer; - printf("Unwrapped buffer: %s\n", cgraph->leafs[i]->buffer->buft->iface.get_name(cgraph->leafs[i]->buffer->buft)); +// printf("Unwrapped buffer: %s\n", cgraph->leafs[i]->buffer->buft->iface.get_name(cgraph->leafs[i]->buffer->buft)); } } - ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), cgraph->n_nodes); + if (!ctx->remote) { + ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), + ctx->backends.size(), cgraph->n_nodes); - printf("Created new scheduler\n"); - ggml_backend_sched_init_measure(sched, cgraph); - printf("Beginning sched graph compute\n"); - ggml_backend_sched_graph_compute(sched, cgraph); +// printf("Created new scheduler\n"); + ggml_backend_sched_init_measure(sched, cgraph); +// printf("Beginning sched graph compute\n"); + ggml_backend_sched_graph_compute(sched, cgraph); + + ggml_backend_sched_free(sched); + + } for (int i = 0; i < cgraph->n_nodes; i++) { - cgraph->nodes[i]->buffer->buft = old_buffs[i].first; - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (cgraph->nodes[i]->src[j] == nullptr) { +// fprintf(stderr, "Wrapping buffer %s for node %s\n", cgraph->nodes[i]->name, ggml_backend_buffer_name(cgraph->nodes[i]->buffer)); + cgraph->nodes[i]->buffer = ggml_backend_mpi_wrap_buffer(cgraph->nodes[i]->buffer); + +// fprintf(stderr, "Setting buffer ranks for node %s with old buff %s\n", cgraph->nodes[i]->name, ggml_backend_buffer_name(old_buffs[i].first)); + + ggml_backend_mpi_buffer_set_rank(cgraph->nodes[i]->buffer, ((ggml_backend_mpi_buffer_context*)old_buffs[i].first->context)->rank); + +// fprintf(stderr, "New buffer rank for node %s: %d\n", cgraph->nodes[i]->name, ctx->rank); + + + for (int iter = 0; iter < GGML_MAX_SRC; iter++) { + auto* j = cgraph->nodes[i]->src[iter]; + if (j == nullptr) { break; } - cgraph->nodes[i]->src[j]->buffer->buft = old_buffs[i].second[j]; + + if (j->buffer->iface.get_name == ggml_backend_mpi_buffer_name) { +// fprintf(stderr, "Skipping buffer ranks for src node %s with buffer type %s\n", j->name, 
j->buffer->buft->iface.get_name(j->buffer->buft)); + continue; + } + +// fprintf(stderr, "Setting buffer ranks for src node %s\n", j->name); + + +// fprintf(stderr, "Source buffer name: %s, buffer type name: %s\n", +// j->buffer->iface.get_name(j->buffer), +// j->buffer->buft->iface.get_name(j->buffer->buft)); + + j->buffer = ggml_backend_mpi_wrap_buffer(j->buffer); + + ggml_backend_mpi_buffer_set_rank(j->buffer, ((ggml_backend_mpi_buffer_type_context*)old_buffs[i].second[iter]->context)->rank); +// j->buffer->buft = old_buffs[i].second[iter]; + +// fprintf(stderr, "New source buffer name: %s, buffer type name: %s\n", +// j->buffer->iface.get_name(j->buffer), +// j->buffer->buft->iface.get_name(j->buffer->buft)); + +// ggml_backend_mpi_buffer_type_set_rank(j->buffer->buft, ctx->rank); } if(cgraph->nodes[i]->view_src != nullptr && cgraph->nodes[i]->view_src->buffer->buft != nullptr) { - cgraph->nodes[i]->view_src->buffer->buft = old_view_buffs[i]; +// fprintf(stderr, "View source %s (buffer name: %s), buffer type name: %s\n", cgraph->nodes[i]->view_src->name, +// cgraph->nodes[i]->view_src->buffer->iface.get_name(cgraph->nodes[i]->view_src->buffer), +// cgraph->nodes[i]->view_src->buffer->buft->iface.get_name(cgraph->nodes[i]->view_src->buffer->buft)); + +// fprintf(stderr, "Old view source buffer type name: %s\n", +// old_view_buffs[i]->iface.get_name(old_view_buffs[i])); +// cgraph->nodes[i]->view_src->buffer = ggml_backend_mpi_wrap_buffer(cgraph->nodes[i]->view_src->buffer); + // WRONG, need to keep the source ranks from before compute +// fprintf(stderr, "View buff %s null\n", (old_view_buffs[i]->context == nullptr) ? " is " : "is NOT "); + if (old_view_buffs[i] != nullptr) { + if (old_view_buffs[i]->iface.get_name == ggml_backend_mpi_buffer_type_name) { + ggml_backend_mpi_buffer_set_rank(cgraph->nodes[i]->view_src->buffer, + ((ggml_backend_mpi_buffer_type_context *) old_view_buffs[i]->context)->rank); + } else { +// ggml_backend_mpi_buffer_set_rank(cgraph->nodes[i]->view_src->buffer, +// ctx->rank); + } + } } } for (int i = 0; i < cgraph->n_leafs; i++) { - cgraph->leafs[i]->buffer->buft = old_buffs_leaves[i]; +// fprintf(stderr, "Wrapping leaf %s...\n", cgraph->leafs[i]->name); + cgraph->leafs[i]->buffer = ggml_backend_mpi_wrap_buffer(cgraph->leafs[i]->buffer); +// fprintf(stderr, "Wrapped leaf buffer: %s\n", ggml_backend_buffer_name(cgraph->leafs[i]->buffer)); + ggml_backend_mpi_buffer_type_set_rank(cgraph->leafs[i]->buffer->buft, ctx->rank); +// fprintf(stderr, "Wrapped leaf after setting rank: %s\n", ggml_backend_buffer_name(cgraph->leafs[i]->buffer)); } - ggml_mpi_graph_compute_post(ctx, cgraph); +// ggml_mpi_graph_compute_post(ctx, cgraph); return true; } static const char * ggml_backend_mpi_name(ggml_backend_t backend) { - return "MPI"; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + return strdup(("MPI(Rank " + std::to_string(((ggml_mpi_context*)backend->context)->rank) + ", local rank " + std::to_string(rank) + ")").c_str()); } static void ggml_backend_mpi_free(ggml_backend_t backend) { @@ -459,8 +615,15 @@ static void ggml_backend_mpi_free(ggml_backend_t backend) { static ggml_backend_buffer_type_t ggml_backend_mpi_get_default_buffer_type(ggml_backend_t backend) { auto * ctx = static_cast(backend->context); + if (ctx->backends.empty()) { + auto * buff = ggml_backend_mpi_wrap_buffer_type(ggml_backend_cpu_buffer_type()); + ggml_backend_mpi_buffer_type_set_rank(buff, ctx->rank); + return buff; + } - return 
ggml_backend_mpi_wrap_buffer(ctx->backends.back()->iface.get_default_buffer_type(ctx->backends.back())); + auto * buff = ggml_backend_mpi_wrap_buffer_type(ctx->backends.back()->iface.get_default_buffer_type(ctx->backends.back())); + ggml_backend_mpi_buffer_type_set_rank(buff, ctx->rank); + return buff; } GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -506,17 +669,35 @@ GGML_CALL bool ggml_backend_is_mpi(ggml_backend_t backend) { } + + GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft) { auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - return strdup(((ctx->name + ":") + std::string(ctx->wrapped_buffer->iface.get_name(ctx->wrapped_buffer))).c_str()); + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + return strdup(((ctx->name + " Buffer Type(Rank " + std::to_string(ctx->rank) + ", local rank " + std::to_string(rank) + "):") + std::string(ctx->wrapped_buffer->iface.get_name(ctx->wrapped_buffer))).c_str()); } GGML_CALL static ggml_backend_buffer_t ggml_backend_mpi_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +// fprintf(stderr, "ALLOCATING NEW BUFFER FOR BUFFER_TYPE %s\n", ggml_backend_buft_name(buft)); + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - ggml_backend_buffer_t buf = ctx->wrapped_buffer->iface.alloc_buffer(ctx->wrapped_buffer, size); - buf->buft = ggml_backend_mpi_wrap_buffer(buf->buft); - return buf; + +// fprintf(stderr, "WRAPPED BUFFER_TYPE %s\n", ggml_backend_buft_name(ctx->wrapped_buffer)); + + + auto* buffer = ggml_backend_mpi_wrap_buffer(ctx->wrapped_buffer->iface.alloc_buffer(ctx->wrapped_buffer, size)); + +// fprintf(stderr, "NEW BUFFER: %s, BUFFER_TYPE: %s\n", ggml_backend_buffer_name(buffer), ggml_backend_buft_name(buffer->buft)); + + + ggml_backend_mpi_buffer_set_rank(buffer, ctx->rank); + +// fprintf(stderr, "NEW BUFFER AFTER SETTING RANK: %s, BUFFER_TYPE: %s\n", ggml_backend_buffer_name(buffer), ggml_backend_buft_name(buffer->buft)); + + return buffer; } GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -530,31 +711,42 @@ GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_max_size(ggml_backend_b } GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { +// fprintf(stderr, "GETTING ALLOC SIZE FOR TENSOR %s (%s) AND BUFFER TYPE %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buft_name(buft)); + + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; return ctx->wrapped_buffer->iface.get_alloc_size(ctx->wrapped_buffer, tensor); } GGML_CALL static bool ggml_backend_mpi_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return backend != nullptr && ggml_backend_is_mpi(backend); + return backend != nullptr && ggml_backend_is_mpi(backend) && ((ggml_backend_mpi_buffer_type_context*) buft->context)->rank == ((ggml_mpi_context*)backend->context)->rank; } GGML_CALL static bool ggml_backend_mpi_buffer_type_is_host(ggml_backend_buffer_type_t buft) { auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - return ctx->wrapped_buffer->iface.is_host(ctx->wrapped_buffer); + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + return ctx->rank == rank && ctx->wrapped_buffer->iface.is_host(ctx->wrapped_buffer); } static std::map cached_wrappers; +static std::map 
cached_buffer_wrappers; + static std::map cached_backends; +GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer_type(ggml_backend_buffer_type_t buft) { -GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft) { - - if (cached_wrappers.find(buft) != cached_wrappers.end()) { - return cached_wrappers[buft]; - } +// if (cached_wrappers.find(buft) != cached_wrappers.end()) { +// fprintf(stderr, "Returning cached buffer type with name %s\n", cached_wrappers[buft]->iface.get_name(cached_wrappers[buft])); +// +// auto * ret = new ggml_backend_buffer_type; +// *ret = *cached_wrappers[buft]; +// return ret; +// } ggml_backend_buffer_type_i ggml_backend_mpi_buffer_type_interface = { /* .get_name = */ ggml_backend_mpi_buffer_type_name, @@ -566,9 +758,11 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_b /* .is_host = */ (buft->iface.is_host != nullptr ) ? ggml_backend_mpi_buffer_type_is_host : nullptr, }; + + auto* ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{ /* .iface = */ ggml_backend_mpi_buffer_type_interface, - /* .context = */ new ggml_backend_mpi_buffer_type_context{"MPI",buft}, + /* .context = */ new ggml_backend_mpi_buffer_type_context{"MPI",buft, 0}, }; cached_wrappers[buft] = ggml_backend_wrapped_buffer_type; @@ -576,26 +770,169 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_b return ggml_backend_wrapped_buffer_type; } -ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t num_backends) { - if (cached_backends.find(wrapped_backends) != cached_backends.end()) { - return cached_backends[wrapped_backends]; + +GGML_CALL static void * ggml_backend_mpi_buffer_get_base(ggml_backend_buffer_t buffer) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + return ctx->wrapped_buffer->iface.get_base(ctx->wrapped_buffer); +} + +GGML_CALL static void ggml_backend_mpi_buffer_free_buffer(ggml_backend_buffer_t buffer) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + return ctx->wrapped_buffer->iface.free_buffer(ctx->wrapped_buffer); +} + +GGML_CALL static void ggml_backend_mpi_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; +// fprintf(stderr, "SETTING TENSOR WITHOUT MPI CALLS FOR %s (%s) AND TGT BUFFER %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer)); + return ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size); +} + +GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + auto * type_ctx = (ggml_backend_mpi_buffer_type_context *) buffer->buft->context; + + + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int src_rank = ((ggml_backend_mpi_buffer_type_context*)tensor->buffer->buft->context)->rank; + + if (rank != src_rank) { + + ggml_mpi_tensor_recv(tensor, data, ((ggml_backend_mpi_buffer_type_context*)tensor->buffer->buft->context)->rank, MPI_COMM_WORLD); + return; } +// fprintf(stderr, "GETTING TENSOR WITH SRC RANK=RANK (%d) FOR TENSOR %s (%s) AND TGT BUFFER %s\n", src_rank, tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer)); + 
ctx->wrapped_buffer->iface.get_tensor(ctx->wrapped_buffer, tensor, data, offset, size); +} + +GGML_CALL static bool ggml_backend_mpi_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + +// fprintf(stderr, "DOING LOCAL TENSOR COPY FOR SRC %s (%s) AND DST %s (%s) WITH TGT BUFFER %s\n", src->name, ggml_backend_buffer_name(src->buffer), dst->name, ggml_backend_buffer_name(dst->buffer), ggml_backend_buffer_name(buffer)); + + return ctx->wrapped_buffer->iface.cpy_tensor(ctx->wrapped_buffer, src, dst); +} + +GGML_CALL static void ggml_backend_mpi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + return ctx->wrapped_buffer->iface.clear(ctx->wrapped_buffer, value); +} + +static struct ggml_backend_buffer_i mpi_backend_buffer_i = { + /* .get_name = */ ggml_backend_mpi_buffer_name, + /* .free_buffer = */ ggml_backend_mpi_buffer_free_buffer, + /* .get_base = */ ggml_backend_mpi_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .set_tensor = */ ggml_backend_mpi_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_mpi_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_mpi_buffer_cpy_tensor, + /* .clear = */ ggml_backend_mpi_buffer_clear, + /* .reset = */ NULL, +}; + +GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_t buf) { + +// if (buf->iface.get_name == mpi_backend_buffer_i.get_name) { +// return buf; +// } + +// if (cached_buffer_wrappers.find(buf) != cached_buffer_wrappers.end()) { +// fprintf(stderr, "Returning cached buffer with name %s\n", cached_buffer_wrappers[buf]->iface.get_name(cached_buffer_wrappers[buf])); +// auto * ret = new ggml_backend_buffer; +// *ret = *cached_buffer_wrappers[buf]; +// auto * ret_type = new ggml_backend_buffer_type; +// *ret_type = *ret->buft; +// ret->buft = ret_type; +// return ret; +// } + + + ggml_backend_buffer_type_t t = ggml_backend_mpi_wrap_buffer_type(buf->buft); + + auto *buffer = new ggml_backend_buffer { + /* .interface = */ mpi_backend_buffer_i, + /* .buft = */ t, + /* .context = */ new ggml_backend_mpi_buffer_context{buf, ((ggml_backend_mpi_buffer_type_context*)t->context)->rank}, + /* .size = */ buf->size, + /* .usage = */ buf->usage + }; + + cached_buffer_wrappers[buf] = buffer; + + + + return buffer; +} + +bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) { + int src_rank = ((ggml_backend_mpi_buffer_type_context*)src->buffer->buft->context)->rank; + int dst_rank = ((ggml_backend_mpi_buffer_type_context*)dst->buffer->buft->context)->rank; + +// fprintf(stderr, "Running tensor async copy for src %s (buffer %s) and dst %s (buffer %s) with backend %s\n", src->name, ggml_backend_buffer_name(src->buffer), dst->name, ggml_backend_buffer_name(dst->buffer), backend->iface.get_name(backend)); + + auto * ctx = static_cast(backend->context); + + if (ctx->remote) { + +// fprintf(stderr, "Skipping tensor copy for remote backend %s.\n", backend->iface.get_name(backend)); + return true; + } + + if (src_rank == dst_rank) { +// src->buffer->iface.cpy_tensor(src->buffer, src, dst); + return true; + } + + if (src_rank == ctx->rank) { + ggml_mpi_tensor_send(src, dst_rank, ctx->comm); + } else if (dst_rank == ctx->rank){ + ggml_mpi_tensor_recv(dst, src_rank, ctx->comm); + } + return true; + +} + +void 
ggml_backend_mpi_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * dst, const void* data, size_t offset, size_t size) { + int dst_rank = ((ggml_backend_mpi_buffer_type_context*)dst->buffer->buft->context)->rank; + +// fprintf(stderr, "Running set tensor for dst %s (buffer %s) with backend %s\n", dst->name, ggml_backend_buffer_name(dst->buffer), backend->iface.get_name(backend)); + + + auto * ctx = static_cast(backend->context); + + GGML_ASSERT(ctx->rank == dst_rank); + + ggml_mpi_tensor_send(dst, data, ctx->rank, ctx->comm); + + +} + + +ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t num_backends, int rank) { + + ggml_mpi_context * ctx = ggml_mpi_init(); std::vector wrapped_backends_v; - for (size_t i = 0; i < num_backends; i++) { - wrapped_backends_v.push_back(wrapped_backends[i]); + if (ctx->rank == rank) { + for (size_t i = 0; i < num_backends; i++) { + wrapped_backends_v.push_back(wrapped_backends[i]); + } + } else { + ctx->remote = true; } ctx->backends = wrapped_backends_v; - + ctx->rank = rank; struct ggml_backend_i mpi_backend_i = { /* .get_name = */ ggml_backend_mpi_name, /* .free = */ ggml_backend_mpi_free, /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type, - /* .set_tensor_async = */ NULL, + /* .set_tensor_async = */ ggml_backend_mpi_set_tensor_async, /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, + /* .cpy_tensor_async = */ ggml_backend_mpi_cpy_tensor_async, /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, @@ -617,9 +954,10 @@ ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t n static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) { // TODO check what the parameters are for. Could use it to setup the MPI comms and routes? 
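
The point-to-point transfer paths in this commit translate ggml element types to MPI datatypes by hand inside each switch; the mapping used by `ggml_mpi_tensor_send` (mirrored on the receive side) is equivalent to the small helper below. This is a sketch for clarity only, and `ggml_mpi_dtype` is a hypothetical name that does not exist in the patch:

```cpp
// Sketch only: the ggml -> MPI datatype mapping used by the tensor
// send/recv helpers in this patch.
#include <mpi.h>
#include "ggml.h"

static MPI_Datatype ggml_mpi_dtype(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_I32: return MPI_INT32_T;
        case GGML_TYPE_F32: return MPI_FLOAT;
        case GGML_TYPE_F16: return MPI_INT16_T; // sent as raw 16-bit words
        default:
            GGML_ASSERT(false && "tensor type not supported over MPI");
            return MPI_BYTE;
    }
}
```
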
GGML_UNUSED(params); + ggml_mpi_backend_init(); auto * v = new std::vector(); v->push_back(ggml_backend_cpu_init()); - return ggml_backend_mpi_init(v->data(), 1); + return ggml_backend_mpi_init(v->data(), 1, 0); } @@ -633,7 +971,7 @@ int ggml_backend_mpi_reg_devices() { ggml_backend_register( device.name, ggml_backend_reg_mpi_init, - ggml_backend_mpi_wrap_buffer(ggml_backend_cpu_buffer_type()), + ggml_backend_mpi_wrap_buffer_type(ggml_backend_cpu_buffer_type()), reinterpret_cast(intptr_t(device.index)) ); } @@ -642,4 +980,19 @@ int ggml_backend_mpi_reg_devices() { +GGML_CALL void ggml_backend_mpi_buffer_type_set_rank(ggml_backend_buffer_type_t buft, int rank) { + if (buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + ((ggml_backend_mpi_buffer_type_context *) buft->context)->rank = rank; + } else { + GGML_ASSERT(!"Buffer type must be wrapped in ggml_backend_mpi_buffer_type"); + } +} +GGML_CALL void ggml_backend_mpi_buffer_set_rank(ggml_backend_buffer_t buf, int rank) { + if (buf->iface.get_name == ggml_backend_mpi_buffer_name) { + ((ggml_backend_mpi_buffer_context *) buf->context)->rank = rank; + ggml_backend_mpi_buffer_type_set_rank(buf->buft, rank); + } else { + GGML_ASSERT(!"Buffer type must be wrapped in ggml_backend_mpi_buffer_type"); + } +} diff --git a/ggml-mpi.h b/ggml-mpi.h index cbe5a51c0..0a5d69e14 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -53,7 +53,10 @@ struct ggml_mpi_context * ggml_mpi_init(void); void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer_type(ggml_backend_buffer_type_t buft); + +GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_t buf); + /** * Create a new context by splitting the given context's @@ -210,7 +213,11 @@ struct ggml_mpi_device { #define MPI_BACKEND_NAME "MPI" GGML_CALL int ggml_backend_mpi_reg_devices(); -GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t num_backends); +GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t num_backends, int rank); + +GGML_CALL void ggml_backend_mpi_buffer_type_set_rank(ggml_backend_buffer_type_t buft, int rank); + +GGML_CALL void ggml_backend_mpi_buffer_set_rank(ggml_backend_buffer_t buft, int rank); #ifdef __cplusplus } From 619bf62acf4c79df4bc8b99a99f597f1092c009c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 12 Mar 2024 11:33:33 -0500 Subject: [PATCH 23/35] Support new MPI backend in llama.cpp and increase GGML max split inputs --- ggml.h | 2 +- llama.cpp | 81 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/ggml.h b/ggml.h index a4efe792d..3544e9d6a 100644 --- a/ggml.h +++ b/ggml.h @@ -226,7 +226,7 @@ #define GGML_MAX_DIMS 4 #define GGML_MAX_PARAMS 2048 -#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_CONTEXTS 128 #define GGML_MAX_SRC 10 #ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 diff --git a/llama.cpp b/llama.cpp index f2f052bbf..0c0c783b1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1474,7 +1474,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer } #if defined(GGML_USE_MPI) - buft = ggml_backend_mpi_wrap_buffer(buft); + buft = ggml_backend_mpi_wrap_buffer_type(buft); #endif return buft; @@ -1528,9 +1528,6 @@ static 
ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g buft = llama_default_buffer_type_offload(fallback_gpu); } -#if defined(GGML_USE_MPI) - buft = ggml_backend_mpi_wrap_buffer(buft); -#endif return buft; @@ -2177,7 +2174,7 @@ static bool llama_kv_cache_init( }; ggml_context * ctx = ggml_init(params); if (!ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__); + LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache, n_layers=%d\n", __func__, n_layers); return false; } ctx_map[it.first] = ctx; @@ -4099,15 +4096,23 @@ static bool llm_load_tensors( } #ifdef GGML_USE_MPI -// for (int64_t i = 0; i < n_layer; i++) { -// model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix), -// ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)}; -// } -// -// model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix), -// ggml_backend_mpi_wrap_buffer(model.buft_input.buft)}; -// model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix), -// ggml_backend_mpi_wrap_buffer(model.buft_output.buft)}; + // TESTING: Setting all non-input/output layers to node 1 + for (int64_t i = 0; i < n_layer; i++) { + ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1); + ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1); + + } + + + // Will run with inputs on other nodes, but output may not be correct. + // Default is node 0 anyway, but better to be explicit about it + ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft, 0); + ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft_matrix, 0); + + + // Outputs *must* be on node 0, otherwise a deadlock occurs + ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft, 0); + ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft_matrix, 0); #endif // count used buffer types @@ -4968,6 +4973,9 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, ctx); buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); +#ifdef GGML_USE_MPI + buf = ggml_backend_mpi_wrap_buffer(buf); +#endif } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { @@ -8784,7 +8792,7 @@ static void llama_graph_compute( ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - + } // decode a batch of tokens by evaluating the transformer @@ -8800,7 +8808,14 @@ static int llama_decode_internal( llama_context & lctx, llama_batch batch_all) { // TODO: rename back to batch + uint32_t n_tokens_all = batch_all.n_tokens; + +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits)); + n_tokens_all = batch_all.n_tokens; +#endif + if (n_tokens_all == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); return -1; @@ -8900,12 +8915,7 @@ static int llama_decode_internal( kv_self.head = 0; } - #ifdef GGML_USE_MPI - // TODO: needs fix after #3228 - ggml_mpi_eval_init(lctx.ctx_mpi, &(u_batch.n_tokens), &(u_batch.pos), &(u_batch.n_seq_id), &(u_batch.seq_id), &(u_batch.logits)); - n_tokens = u_batch.n_tokens; -#endif - if (!llama_kv_cache_find_slot(kv_self, u_batch)) { + if (!llama_kv_cache_find_slot(kv_self, u_batch)) { return 1; } @@ -8991,7 +9001,11 @@ static int llama_decode_internal( // TODO: do not compute and extract logits if only embeddings are needed 
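
The `ggml_backend_mpi_buffer_type_set_rank` calls above pin every non-input/output layer to node 1 as a placeholder (see the `TESTING` comment). One plausible replacement, sketched here under the assumption that the per-node weights passed to `llama_split_layers_weighted` remain the source of the split; the `assign_layer_ranks` helper is hypothetical and not part of the patch:

```cpp
// Sketch only: derive per-layer ranks from per-node split weights instead of
// hard-coding node 1.
#include <cstdint>
#include <vector>
#include "ggml-backend.h"
#include "ggml-mpi.h"

static void assign_layer_ranks(std::vector<ggml_backend_buffer_type_t> & layer_bufts,
                               const std::vector<float>               & weights) {
    const int64_t n_layer = (int64_t) layer_bufts.size();
    int64_t layer = 0;
    for (size_t rank = 0; rank < weights.size(); ++rank) {
        // each rank owns a contiguous slice of layers proportional to its weight;
        // the last rank absorbs any rounding remainder
        const int64_t n_own = (rank + 1 == weights.size())
                                  ? n_layer - layer
                                  : (int64_t) (weights[rank] * (float) n_layer);
        for (int64_t i = 0; i < n_own && layer < n_layer; ++i, ++layer) {
            ggml_backend_mpi_buffer_type_set_rank(layer_bufts[layer], (int) rank);
        }
    }
}
```
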
// update the graphs to skip "result_output" if logits are not needed if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); + #ifdef GGML_USE_MPI + if (ggml_mpi_rank(lctx.ctx_mpi) == 0) { +#endif + + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); GGML_ASSERT(backend_res != nullptr); if (u_batch.logits) { int32_t i_first = -1; @@ -9092,6 +9106,10 @@ static int llama_decode_internal( } } +#ifdef GGML_USE_MPI + } +#endif + return 0; } @@ -13008,7 +13026,8 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_MPI - ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size())}; + + ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1), ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)}; @@ -13134,14 +13153,14 @@ struct llama_context * llama_new_context_with_model( } void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) { -#ifdef GGML_USE_MPI - if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) { - GGML_ASSERT(false && "Must have same number of split percentages as devices"); - } - uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); - ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); - free(ranges); -#endif +//#ifdef GGML_USE_MPI +// if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) { +// GGML_ASSERT(false && "Must have same number of split percentages as devices"); +// } +// uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); +// ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); +// free(ranges); +//#endif } void llama_free(struct llama_context * ctx) { From 01be58caa9362318701e107f651b3766e0c26ff0 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 12 Mar 2024 11:34:20 -0500 Subject: [PATCH 24/35] Fix simple to use new per-node thread count --- examples/simple/simple.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 39e2d8ea4..b31d77fbe 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -53,8 +53,8 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = 2048; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.n_threads[0]; + ctx_params.n_threads_batch = params.n_threads_batch[0] == -1 ? 
params.n_threads[0] : params.n_threads_batch[0]; llama_context * ctx = llama_new_context_with_model(model, ctx_params); From c6280bc3f4938e09d059c41bc881f2db8393abf1 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 12 Mar 2024 12:40:23 -0500 Subject: [PATCH 25/35] Update to use backend GUID and changed signatures --- ggml-mpi.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index a16cc48b7..6ee5168ba 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -304,7 +304,7 @@ void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml for (int i = 0; i < gf->n_nodes; i++) { - gf->nodes[i]->backend = GGML_BACKEND_MPI_SPLIT; + gf->nodes[i]->backend = GGML_BACKEND_TYPE_MPI_SPLIT; } @@ -382,7 +382,7 @@ GGML_CALL static const char * ggml_backend_mpi_buffer_name(ggml_backend_buffer_t GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft); -GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context; @@ -511,7 +511,7 @@ GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggm ctx->backends.size(), cgraph->n_nodes); // printf("Created new scheduler\n"); - ggml_backend_sched_init_measure(sched, cgraph); + ggml_backend_sched_reserve(sched, cgraph); // printf("Beginning sched graph compute\n"); ggml_backend_sched_graph_compute(sched, cgraph); @@ -593,7 +593,7 @@ GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggm // ggml_mpi_graph_compute_post(ctx, cgraph); - return true; + return GGML_STATUS_SUCCESS; } @@ -914,6 +914,8 @@ void ggml_backend_mpi_set_tensor_async(ggml_backend_t backend, struct ggml_tenso ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t num_backends, int rank) { + static ggml_guid backend_mpi_guid = {0xec, 0x39, 0xce, 0x40, 0xc3, 0x43, 0x49, 0x36, 0x96, 0x03, 0x55, 0x77, 0x5c, 0x1f, 0x44, 0xd3}; + ggml_mpi_context * ctx = ggml_mpi_init(); std::vector wrapped_backends_v; @@ -942,6 +944,7 @@ ggml_backend_t ggml_backend_mpi_init(ggml_backend_t * wrapped_backends, size_t n }; auto *mpi_backend = new ggml_backend { + /* .guid = */ &backend_mpi_guid, /* .interface = */ mpi_backend_i, /* .context = */ ctx, }; From 72dcd66c0fe23f7639728641790b96acd0ed575f Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 12 Mar 2024 14:20:03 -0500 Subject: [PATCH 26/35] Resize seq_ids by n_seq_max, port over sync_pipelined instead of using Bcast --- ggml-mpi.cpp | 200 +++++++++++++++++++++++++++++++++++++-------------- ggml-mpi.h | 47 +++++++++++- llama.cpp | 21 ++++-- 3 files changed, 207 insertions(+), 61 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 6ee5168ba..37e5f67c7 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -14,6 +14,10 @@ #define UNUSED GGML_UNUSED +static bool have_init = false; + +static void* send_buffer; + struct ggml_mpi_context { int rank; int size; @@ -26,18 +30,37 @@ struct ggml_mpi_context { std::vector backends; ggml_backend_sched_t scheduler; bool remote; + void* send_buffer; }; void ggml_mpi_backend_init(void) { int ret; - MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &ret); + + GGML_ASSERT(MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &ret) == MPI_SUCCESS); + have_init = true; + const int buffer_size = 128*1024*1024*8; + send_buffer 
= calloc(1, buffer_size); // 128MB buffer +// fprintf(stderr, "BUFFER ATTACH RETCODE=%d\n", MPI_Buffer_attach(send_buffer, buffer_size)); } +void ggml_mpi_sync_pipelined( + struct ggml_mpi_context * ctx_mpi, + void * val, + int count, + MPI_Datatype datatype, + int tag +); + void ggml_mpi_backend_free(void) { MPI_Finalize(); } struct ggml_mpi_context * ggml_mpi_init(void) { + + if (!have_init) { + ggml_mpi_backend_init(); + } + auto * ctx = new ggml_mpi_context; MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank); @@ -45,6 +68,8 @@ struct ggml_mpi_context * ggml_mpi_init(void) { ctx->comm = MPI_COMM_WORLD; ctx->remote = false; + ctx->send_buffer = send_buffer; + return ctx; } @@ -69,78 +94,147 @@ size_t ggml_mpi_size(struct ggml_mpi_context * ctx) { return ctx->size; } +int ggml_mpi_next_node(struct ggml_mpi_context * ctx_mpi) { + return (ctx_mpi->rank + 1) % ctx_mpi->size; +} + +int ggml_mpi_prev_node(struct ggml_mpi_context * ctx_mpi) { + int temp = (ctx_mpi->rank - 1); + return (temp >= 0) ? temp : ctx_mpi->size - 1; +} + +void ggml_mpi_sync_pipelined( + struct ggml_mpi_context * ctx_mpi, + void * val, + int count, + MPI_Datatype datatype, + int tag +) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } + +// printf("Rank %d sync pipelined with tag %d\n", ctx_mpi->rank, tag); + + + if (ctx_mpi->rank != 0) { + MPI_Recv(val, count, datatype, ggml_mpi_prev_node(ctx_mpi), tag, ctx_mpi->comm, MPI_STATUS_IGNORE); + } + if(ctx_mpi->rank < ctx_mpi->size - 1) { + GGML_ASSERT(ctx_mpi->send_buffer != nullptr); + const int retval = MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); + GGML_ASSERT(retval == MPI_SUCCESS); + + } +} + void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, int32_t ** pos, int32_t ** n_seq_ids, int32_t *** seq_id, - int8_t ** logits) { + int8_t ** logits, + uint32_t n_seq_max) { -// fprintf(stderr, "Beginning eval init on rank %d\n", ctx_mpi->rank); - MPI_Barrier(ctx_mpi->comm); + if(ctx_mpi->comm == MPI_COMM_NULL) { + return; + } int32_t old_n_tokens = *n_tokens; - MPI_Bcast(n_tokens, 1, MPI_INT32_T, 0, ctx_mpi->comm); -// fprintf(stderr, "Node %d, old_n_tokens: %d, new n_tokens: %d\n", ctx_mpi->rank, old_n_tokens, *n_tokens); - // If what was passed in differs from what was broadcast, - // we can't guarantee the allocated sizes are correct - // TODO check how often this is done and if it's a problem, - // try to allocate ahead of time - if (old_n_tokens != *n_tokens) { - *pos = static_cast(realloc(*pos, *n_tokens * sizeof(int32_t))); - *n_seq_ids = static_cast(realloc(*n_seq_ids, *n_tokens * sizeof(int32_t))); - *logits = static_cast(realloc(*logits, *n_tokens * sizeof(int32_t))); + ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, GGML_MPI_N_TOKENS); + int8_t* temp_logits = (int8_t*) calloc(*n_tokens, sizeof(int8_t)); + + if (ctx_mpi->rank == 0 && *logits != nullptr) { + ggml_mpi_sync_pipelined(ctx_mpi, *logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS); + } else { + ggml_mpi_sync_pipelined(ctx_mpi, temp_logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS); } -// MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); + if (ctx_mpi->rank != 0) { + bool should_set_batch_logits = false; + for (int i = 0; i < *n_tokens; i++) { + if (temp_logits[i]) { + should_set_batch_logits = true; + break; + } + } + if (should_set_batch_logits) { + if (*logits != NULL) { + free(*logits); + *logits = NULL; + } + *logits = temp_logits; + } else { + if 
(*logits != NULL) { + free(*logits); + *logits = NULL; + } + free(temp_logits); + } + } else { + free(temp_logits); + } + + // For now, we assume that the pos, seq_ids, tokens, etc have been + // pre-allocated for the largest possible sizes, even on worker nodes. + //if (old_n_tokens != *n_tokens) { + // *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); + // *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); + // *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); + //} + + GGML_ASSERT(n_seq_ids != nullptr); + GGML_ASSERT(n_tokens != nullptr); + + + // FIXME Syncing n_seq_ids causes MPI to throw an invalid buffer error in Bsend +// ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, GGML_MPI_N_SEQ_IDS); // We need to know the total number of sequence // ids, so we count them all up - int32_t total_n_seq_ids = 0; - for (int32_t i = 0; i < *n_tokens; i++) { - total_n_seq_ids += (*n_seq_ids)[i]; - } - - // MPI can't chase the pointers for multidimensional arrays, so we flatten them first - // for transit - auto * flattened_seq_ids = static_cast(calloc(total_n_seq_ids, sizeof(int32_t))); - - int32_t current_index = 0; - - // Only rank 0 needs to flatten since the others don't have the real seq_id - if (ctx_mpi->rank == 0) { - for (int32_t i = 0; i < *n_tokens; i++) { - for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { - flattened_seq_ids[current_index] = (*seq_id)[i][j]; - current_index++; - } - } - } - - - MPI_Bcast( *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm); - MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm); - //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm); - auto ** new_seq_id = static_cast(calloc(*n_tokens, sizeof(int32_t *))); - current_index = 0; - for (int32_t i = 0; i < *n_tokens; i++) { - new_seq_id[i] = static_cast(calloc((*n_seq_ids)[i], sizeof(int32_t))); - for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { - new_seq_id[i][j] = flattened_seq_ids[current_index]; - current_index++; - } - } - free(flattened_seq_ids); - //free(*seq_id); // <- something is still holding onto this, need to investigate - *seq_id = new_seq_id; +// int32_t total_n_seq_ids = 0; +// for (int32_t i = 0; i < *n_tokens; i++) { +// total_n_seq_ids += (*n_seq_ids)[i]; +// } +// +// // MPI can't chase the pointers for multidimensional arrays, so we flatten them first +// // for transit +// int32_t * flattened_seq_ids = static_cast(calloc(total_n_seq_ids, sizeof(int32_t))); +// +// int32_t current_index = 0; +// +// // Only rank 0 needs to flatten since the others don't have the real seq_id +// if (ctx_mpi->rank == 0) { +// for (int32_t i = 0; i < *n_tokens; i++) { +// for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { +// flattened_seq_ids[current_index] = (*seq_id)[i][j]; +// current_index++; +// } +// } +// } +// +// +// +// ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, GGML_MPI_POS); +// ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, GGML_MPI_SEQ_IDS); +// +// current_index = 0; +// for (int32_t i = 0; i < *n_tokens; i++) { +// for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { +// (*seq_id)[i][j] = flattened_seq_ids[current_index]; +// current_index++; +// } +// +// } +// free(flattened_seq_ids); } + void ggml_mpi_synch_int( struct ggml_mpi_context * ctx_mpi, int32_t * val diff --git a/ggml-mpi.h b/ggml-mpi.h index 0a5d69e14..6497f47c8 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -12,6 +12,50 @@ struct ggml_cgraph; extern "C" { #endif +#define GGML_MPI_DECODE 0 + +#define 
GGML_MPI_KV_CLEAR 1 + +#define GGML_MPI_KV_SEQ_RM 2 + +#define GGML_MPI_KV_SEQ_CP 3 + +#define GGML_MPI_KV_SEQ_KEEP 4 + +#define GGML_MPI_KV_SEQ_SHIFT 5 + +#define GGML_MPI_SHUTDOWN 6 + +#define GGML_MPI_TRANSFER_TENSORS 7 + +#define GGML_MPI_SYNC_LOGITS 8 + +#define GGML_MPI_CANCEL_RUN 9 + +#define GGML_MPI_KV_SEQ_CP_BACK 10 + +#define GGML_MPI_TRANS_ID 11 + +#define GGML_MPI_BATCH_ID 12 + +#define GGML_MPI_N_TOKENS 13 + +#define GGML_MPI_TOKENS 14 + +#define GGML_MPI_N_SEQ_IDS 15 + +#define GGML_MPI_SEQ_IDS 16 + +#define GGML_MPI_POS 17 + +#define GGML_MPI_BEGIN_TRANSACTION 18 + +#define GGML_MPI_MAX_N_SEQ 19 + +#define GGML_MPI_BATCH_LOGITS 20 + + + /** * The context used for MPI operations, * a program may make use of more than one @@ -131,7 +175,8 @@ void ggml_mpi_eval_init( int32_t ** pos, int32_t ** n_seq_ids, int32_t *** seq_id, - int8_t ** logits); + int8_t ** logits, + uint32_t n_seq_max); void ggml_mpi_synch_int( struct ggml_mpi_context * ctx_mpi, diff --git a/llama.cpp b/llama.cpp index 0c0c783b1..9e0343cad 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1750,6 +1750,7 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + uint32_t n_seq_max; }; struct llama_layer { @@ -8812,7 +8813,7 @@ static int llama_decode_internal( uint32_t n_tokens_all = batch_all.n_tokens; #ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits)); + ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max); n_tokens_all = batch_all.n_tokens; #endif @@ -8896,7 +8897,7 @@ static int llama_decode_internal( seq_id_arr.resize(n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { n_seq_id[i] = 1; - seq_id[i].resize(1); + seq_id[i].resize(lctx.cparams.n_seq_max); seq_id[i][0] = u_batch.all_seq_id; seq_id_arr[i] = seq_id[i].data(); } @@ -12753,6 +12754,9 @@ void llama_backend_init(void) { ggml_free(ctx); } +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif } @@ -12760,10 +12764,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) { if (numa != GGML_NUMA_STRATEGY_DISABLED) { ggml_numa_init(numa); } - -#ifdef GGML_USE_MPI - ggml_mpi_backend_init(); -#endif } void llama_backend_free(void) { @@ -12844,7 +12844,7 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - // TODO: maybe add n_seq_max here too + cparams.n_seq_max = params.n_seq_max; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; @@ -13984,6 +13984,13 @@ void llama_batch_free(struct llama_batch batch) { free(batch.seq_id); } if (batch.logits) free(batch.logits); + + batch.token = nullptr; + batch.embd = nullptr; + batch.pos = nullptr; + batch.n_seq_id = nullptr; + batch.seq_id = nullptr; + batch.logits = nullptr; } int32_t llama_decode( From 5f156f3a0cf751dc91b57d2e017231dac255f85c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 12 Mar 2024 18:17:42 -0500 Subject: [PATCH 27/35] Clean up MPI backend a tad --- ggml-mpi.cpp | 534 ++++++++++++++++++++------------------------------- ggml-mpi.h | 43 +---- 2 files changed, 209 insertions(+), 368 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 37e5f67c7..30e74c8bc 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -141,7 +141,6 @@ void ggml_mpi_eval_init( if(ctx_mpi->comm == MPI_COMM_NULL) 
{ return; } - int32_t old_n_tokens = *n_tokens; ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, GGML_MPI_N_TOKENS); @@ -182,11 +181,6 @@ void ggml_mpi_eval_init( // For now, we assume that the pos, seq_ids, tokens, etc have been // pre-allocated for the largest possible sizes, even on worker nodes. - //if (old_n_tokens != *n_tokens) { - // *pos = realloc(*pos, *n_tokens * sizeof(int32_t)); - // *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t )); - // *tokens = realloc(*tokens, *n_tokens * sizeof(int32_t )); - //} GGML_ASSERT(n_seq_ids != nullptr); GGML_ASSERT(n_tokens != nullptr); @@ -235,30 +229,13 @@ void ggml_mpi_eval_init( } -void ggml_mpi_synch_int( +void ggml_mpi_sync_int( struct ggml_mpi_context * ctx_mpi, int32_t * val ) { MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm); } -static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) { - struct ggml_tensor * t = ggml_graph_get_tensor(gf, name); - if (t == NULL) { - fprintf(stderr, "%s: tensor %s not found\n", __func__, name); - return -1; - } - - for (int i = 0; i < gf->n_nodes; i++) { - if (gf->nodes[i] == t) { - return i; - } - } - - fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name); - return -1; -} - static void ggml_mpi_tensor_send(const struct ggml_tensor * t, const void* data, int mpi_rank_dst, MPI_Comm comm) { MPI_Datatype mpi_type; @@ -340,154 +317,172 @@ uint16_t** ggml_mpi_split_range( } -void ggml_mpi_scatter_layers( - struct ggml_mpi_context * ctx_mpi, - uint16_t ** layer_ranges -) { - // Layer ranges is a 2d array with the first dimension - // having a length of the number of nodes and the second - // dimension having a length of 2. The inner arrays contain - // the start and end layer ID for a node. - uint16_t flattened_ranges[ctx_mpi->size * 2]; - - if (layer_ranges != NULL) { - for (int i = 0; i < ctx_mpi->size * 2; i += 2) { - flattened_ranges[i] = layer_ranges[i/2][0]; - flattened_ranges[i + 1] = layer_ranges[i/2][1]; - } - } - - uint16_t received_range[2]; - MPI_Scatter(flattened_ranges, 2, MPI_UINT16_T, received_range, 2, MPI_UINT16_T, 0, ctx_mpi->comm); - ctx_mpi->layer_start = received_range[0]; - ctx_mpi->layer_end = received_range[1]; - fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end); -} - -void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) { - - struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); - if (inp_tokens == NULL) { - fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__); - return; - } - - struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0"); - if (inp0 == NULL) { - fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); - return; - } - - ctx_mpi->inp0 = inp0; - -// fprintf(stderr, "gf->nodes[0] == %s\n", ggml_get_name(gf->nodes[0])); -// -// GGML_ASSERT(inp0 == gf->nodes[0]); - - // distribute the compute graph into slices across the MPI nodes - // - // the main node (0) processes the last layers + the remainder of the compute graph - // and is responsible to pass the input tokens to the first node (1) - // - // node 1: [( 0) * n_per_node, ( 1) * n_per_node) - // node 2: [( 1) * n_per_node, ( 2) * n_per_node) - // ... 
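// Illustrative sketch, not part of the patch: the ring-pipelined broadcast that
// ggml_mpi_sync_pipelined (added earlier in this series) uses in place of
// MPI_Bcast. Every rank except the head receives the value from its
// predecessor, and every rank except the tail forwards it to its successor
// with a buffered send, so data flows 0 -> 1 -> ... -> size-1 without a
// collective. The tag, attach-buffer size, and n_tokens value below are
// arbitrary example choices, not values taken from the patch.
#include <mpi.h>
#include <cstdio>
#include <cstdlib>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // MPI_Bsend requires an attached buffer large enough for outstanding sends
    const int buf_size = 1024 * 1024;
    void * bsend_buf = std::malloc(buf_size);
    MPI_Buffer_attach(bsend_buf, buf_size);

    const int tag  = 42;
    const int prev = (rank - 1 + size) % size; // predecessor in the ring
    const int next = (rank + 1) % size;        // successor in the ring

    int n_tokens = (rank == 0) ? 512 : 0;      // only the head rank knows the value

    if (rank != 0) {
        MPI_Recv(&n_tokens, 1, MPI_INT, prev, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    if (rank < size - 1) {
        MPI_Bsend(&n_tokens, 1, MPI_INT, next, tag, MPI_COMM_WORLD);
    }

    std::printf("rank %d sees n_tokens = %d\n", rank, n_tokens);

    int detached_size = 0;
    MPI_Buffer_detach(&bsend_buf, &detached_size);
    std::free(bsend_buf);

    MPI_Finalize();
    return 0;
}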
- // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node) - // node 0: [(n-1) * n_per_node, n_nodes) - // - - - for (int i = 0; i < gf->n_nodes; i++) { - gf->nodes[i]->backend = GGML_BACKEND_TYPE_MPI_SPLIT; - } - - -} - -// TODO: there are many improvements that can be done to this implementation -void ggml_mpi_graph_compute_pre( - struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf) { - const int mpi_rank = ctx_mpi->rank; - const int mpi_size = ctx_mpi->size; - - struct ggml_tensor * inp0 = gf->nodes[0]; - if (inp0 == NULL) { - fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__); - return; - } - - if (mpi_rank > 0) { -// ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); -// if (mpi_rank == 1) { -// // the first node (1) receives the input tokens from the main node (0) -// if (inp_tokens->data == NULL) { -// -// } -// ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm); -// } else { -// // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph) -// fprintf(stderr, "%s:%d: receiving layer inp0\n", __func__, ctx_mpi->rank); -// ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm); -// } - } else if (mpi_size > 1) { - // node 0 sends the input tokens to node 1 -// ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm); - - // recv the output data from the last node -// ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm); - } -} - -void ggml_mpi_graph_compute_post( - struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf) { - - const int mpi_rank = ctx_mpi->rank; - const int mpi_size = ctx_mpi->size; - - // send the output data to the next node - if (mpi_rank > 0) { -// ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm); - } -} - // BACKEND V2 struct ggml_backend_mpi_buffer_context { ggml_backend_buffer_t wrapped_buffer; - int rank; + ggml_mpi_context * ctx_mpi; }; struct ggml_backend_mpi_buffer_type_context { std::string name; - ggml_backend_buffer_type_t wrapped_buffer; - int rank; + ggml_backend_buffer_type_t wrapped_buffer_type; + ggml_mpi_context * ctx_mpi; }; -GGML_CALL static const char * ggml_backend_mpi_buffer_name(ggml_backend_buffer_t buffer) { - auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; +int ggml_backend_mpi_buffer_type_rank(ggml_backend_buffer_type_t buft); - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); +int ggml_backend_mpi_buffer_type_local_rank(ggml_backend_buffer_type_t buft); - return strdup((("MPI Buffer(Rank " + std::to_string(ctx->rank) + ", local rank " + std::to_string(rank) + "):") + std::string(ctx->wrapped_buffer->iface.get_name(ctx->wrapped_buffer))).c_str()); +GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + + + return strdup( + ( + ctx->name + + " Buffer Type(Rank " + + std::to_string( + ggml_backend_mpi_buffer_type_rank(buft) + ) + + ", local rank " + + std::to_string(ggml_backend_mpi_buffer_type_local_rank(buft)) + + "):" + + std::string( + ctx->wrapped_buffer_type->iface.get_name(ctx->wrapped_buffer_type) + ) + ).c_str() + ); } +MPI_Comm ggml_backend_mpi_buffer_type_get_comm(ggml_backend_buffer_type_t buft) { + auto * buft_ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + return buft_ctx->ctx_mpi->comm; + +} + +MPI_Comm ggml_backend_mpi_buffer_get_comm(ggml_backend_buffer_t buffer) { + return ggml_backend_mpi_buffer_type_get_comm(buffer->buft); +} + +MPI_Comm 
ggml_backend_mpi_get_comm(ggml_backend_t backend) { + auto * ctx = (ggml_mpi_context *) backend->context; + + return ctx->comm; +} + +int ggml_backend_mpi_buffer_local_rank(ggml_backend_buffer_t buffer) { + int rank; + int ret = MPI_Comm_rank(ggml_backend_mpi_buffer_get_comm(buffer), &rank); + GGML_ASSERT(ret == MPI_SUCCESS); + return rank; +} + +int ggml_backend_mpi_buffer_type_local_rank(ggml_backend_buffer_type_t buft) { + int rank; + int ret = MPI_Comm_rank(ggml_backend_mpi_buffer_type_get_comm(buft), &rank); + GGML_ASSERT(ret == MPI_SUCCESS); + return rank; +} + +int ggml_backend_mpi_local_rank(ggml_backend_t backend) { + int rank; + int ret = MPI_Comm_rank(ggml_backend_mpi_get_comm(backend), &rank); + GGML_ASSERT(ret == MPI_SUCCESS); + return rank; +} + +int ggml_backend_mpi_buffer_rank(ggml_backend_buffer_t buffer) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + return ctx->ctx_mpi->rank; +} + +int ggml_backend_mpi_buffer_type_rank(ggml_backend_buffer_type_t buft) { + GGML_ASSERT(buft->iface.get_name == ggml_backend_mpi_buffer_type_name); + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + GGML_ASSERT(ctx != nullptr); + GGML_ASSERT(ctx->ctx_mpi != nullptr); + return ctx->ctx_mpi->rank; +} + +int ggml_backend_mpi_rank(ggml_backend_t backend) { + auto * ctx = (ggml_mpi_context *) backend->context; + return ctx->rank; +} + +ggml_backend_buffer_t ggml_backend_mpi_buffer_unwrap(ggml_backend_buffer_t buffer) { + auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + + ggml_backend_buffer_t wrapped_buffer = ctx->wrapped_buffer; + wrapped_buffer->usage = buffer->usage; + wrapped_buffer->size = buffer->size; + return wrapped_buffer; + +} + +ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type_unwrap(ggml_backend_buffer_type_t buft) { + auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + + ggml_backend_buffer_type_t wrapped_buffer_type = ctx->wrapped_buffer_type; + return wrapped_buffer_type; + +} + + +GGML_CALL static const char * ggml_backend_mpi_buffer_name(ggml_backend_buffer_t buffer) { + + + + return strdup( + ( + + "MPI Buffer(Rank " + + std::to_string(ggml_backend_mpi_buffer_rank(buffer)) + + ", local rank " + + std::to_string(ggml_backend_mpi_buffer_local_rank(buffer)) + + "):" + + std::string( + ggml_backend_buffer_name( + ggml_backend_mpi_buffer_unwrap(buffer) + ) + ) + ).c_str() + ); +} + + GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft); +GGML_CALL void ggml_backend_mpi_buffer_type_copy_ctx(ggml_backend_buffer_type_t src, ggml_backend_buffer_type_t dst) { + if (src->iface.get_name == ggml_backend_mpi_buffer_type_name) { + *((ggml_backend_mpi_buffer_type_context *) dst->context)->ctx_mpi = *((ggml_backend_mpi_buffer_type_context *) src->context)->ctx_mpi; + } else { + GGML_ASSERT(!"Buffer type must be wrapped in ggml_backend_mpi_buffer_type_t"); + } +} + +GGML_CALL void ggml_backend_mpi_buffer_copy_ctx(ggml_backend_buffer_t src, ggml_backend_buffer_t dst) { + if (src->iface.get_name == ggml_backend_mpi_buffer_name) { + *((ggml_backend_mpi_buffer_context *) dst->context)->ctx_mpi = *((ggml_backend_mpi_buffer_context *) src->context)->ctx_mpi; + ggml_backend_mpi_buffer_type_copy_ctx(src->buft, dst->buft); + } else { + GGML_ASSERT(!"Buffer must be wrapped in ggml_backend_mpi_buffer_t"); + } +} + +GGML_CALL void ggml_backend_mpi_buffer_copy_ctx_from_type(ggml_backend_buffer_type_t src, ggml_backend_buffer_t dst) { + if (src->iface.get_name == 
ggml_backend_mpi_buffer_type_name) { + *((ggml_backend_mpi_buffer_context *) dst->context)->ctx_mpi = *((ggml_backend_mpi_buffer_type_context *) src->context)->ctx_mpi; + ggml_backend_mpi_buffer_type_copy_ctx(src, dst->buft); + } else { + GGML_ASSERT(!"Buffer must be wrapped in ggml_backend_mpi_buffer_t"); + } +} + GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context; - -// if (ctx->remote) { -// return true; -// } - - -// ggml_mpi_graph_compute_pre(ctx, cgraph); - std::vector backend_buft; for (auto *curr_backend: ctx->backends) { if (ggml_backend_is_cpu(curr_backend)) { @@ -498,195 +493,115 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t } } -// ggml_backend_t wrapped_backend = ctx->wrapped_backend; -// bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph); -// printf("Running MPI backend\n"); - std::vector>> old_buffs( + std::vector>> old_buffs( cgraph->n_nodes); - std::vector old_view_buffs(cgraph->n_nodes); + std::vector old_view_buffs(cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { old_buffs[i].first = cgraph->nodes[i]->buffer; -// if (cgraph->nodes[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { -// cgraph->nodes[i]->buffer = ((ggml_backend_mpi_buffer_context *) cgraph->nodes[i]->buffer->context)->wrapped_buffer; -//// printf("Unwrapped buffer: %s\n", cgraph->nodes[i]->buffer->buft->iface.get_name(cgraph->nodes[i]->buffer->buft)); -// } + for (auto &src: cgraph->nodes[i]->src) { if (src == nullptr) { break; } - old_buffs[i].second.push_back(src->buffer->buft); -// if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { -// src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; -//// printf("Unwrapped buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); -// } + old_buffs[i].second.push_back(src->buffer); + } auto *src = cgraph->nodes[i]->view_src; if (src != nullptr) { -// fprintf(stderr, "View src is not null, src=%s, src buffer=%s\n", src->name, ggml_backend_buffer_name(src->buffer)); if (src->buffer->buft != nullptr) { -// fprintf(stderr, "View src buffer type is not null, buft=%s\n", ggml_backend_buft_name(src->buffer->buft)); - old_view_buffs[i] = src->buffer->buft; -// if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { -// src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; -// printf("Unwrapped view buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); -// } - } else { -// old_view_buffs[i] = ggml_backend_mpi_wrap_buffer_type(ggml_backend_cpu_buffer_type()); -// ggml_backend_mpi_buffer_type_set_rank(old_view_buffs[i], ((ggml_backend_mpi_buffer_context*)src->buffer->context)->rank); + old_view_buffs[i] = src->buffer; + } - } else { -// fprintf(stderr, "OLD VIEW BUFF IS NULL\n"); } } for (int i = 0; i < cgraph->n_nodes; i++) { if (cgraph->nodes[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - auto usage = cgraph->nodes[i]->buffer->usage; - cgraph->nodes[i]->buffer = ((ggml_backend_mpi_buffer_context *) cgraph->nodes[i]->buffer->context)->wrapped_buffer; - cgraph->nodes[i]->buffer->usage = usage; -// printf("Unwrapped buffer: %s\n", cgraph->nodes[i]->buffer->buft->iface.get_name(cgraph->nodes[i]->buffer->buft)); + cgraph->nodes[i]->buffer = 
ggml_backend_mpi_buffer_unwrap(cgraph->nodes[i]->buffer); } for (auto &src: cgraph->nodes[i]->src) { if (src == nullptr) { break; } -// old_buffs[i].second.push_back(src->buffer->buft); if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - auto usage = src->buffer->usage; - src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; - src->buffer->usage = usage; -// printf("Unwrapped buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); + src->buffer = ggml_backend_mpi_buffer_unwrap(src->buffer); } } auto *src = cgraph->nodes[i]->view_src; if (src != nullptr) { -// fprintf(stderr, "View src is not null, src=%s, src buffer=%s\n", src->name, ggml_backend_buffer_name(src->buffer)); if (src->buffer->buft != nullptr) { -// fprintf(stderr, "View src buffer type is not null, buft=%s\n", ggml_backend_buft_name(src->buffer->buft)); -// old_view_buffs[i] = src->buffer->buft; - if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - auto usage = src->buffer->usage; - src->buffer = ((ggml_backend_mpi_buffer_context *) src->buffer->context)->wrapped_buffer; - src->buffer->usage = usage; -// printf("Unwrapped view buffer src: %s\n", src->buffer->buft->iface.get_name(src->buffer->buft)); + if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + src->buffer = ggml_backend_mpi_buffer_unwrap(src->buffer); } - } else { -// old_view_buffs[i] = ggml_backend_mpi_wrap_buffer_type(ggml_backend_cpu_buffer_type()); -// ggml_backend_mpi_buffer_type_set_rank(old_view_buffs[i], ((ggml_backend_mpi_buffer_context*)src->buffer->context)->rank); } - } else { -// fprintf(stderr, "OLD VIEW BUFF IS NULL\n"); } } - -// fprintf(stderr, "Original n_leafs: %d\n", cgraph->n_leafs); - std::vector old_buffs_leaves; for (int i = 0; i < cgraph->n_leafs; i++) { -// fprintf(stderr, "Pushing leaf %s\n", cgraph->leafs[i]->name); old_buffs_leaves.push_back(cgraph->leafs[i]->buffer->buft); if (cgraph->leafs[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - cgraph->leafs[i]->buffer->buft = ((ggml_backend_mpi_buffer_type_context *) cgraph->leafs[i]->buffer->buft->context)->wrapped_buffer; -// printf("Unwrapped buffer: %s\n", cgraph->leafs[i]->buffer->buft->iface.get_name(cgraph->leafs[i]->buffer->buft)); + cgraph->leafs[i]->buffer = ggml_backend_mpi_buffer_unwrap(cgraph->leafs[i]->buffer); } } if (!ctx->remote) { ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), - ctx->backends.size(), cgraph->n_nodes); + (int) ctx->backends.size(), cgraph->n_nodes); -// printf("Created new scheduler\n"); ggml_backend_sched_reserve(sched, cgraph); -// printf("Beginning sched graph compute\n"); ggml_backend_sched_graph_compute(sched, cgraph); - ggml_backend_sched_free(sched); } for (int i = 0; i < cgraph->n_nodes; i++) { -// fprintf(stderr, "Wrapping buffer %s for node %s\n", cgraph->nodes[i]->name, ggml_backend_buffer_name(cgraph->nodes[i]->buffer)); cgraph->nodes[i]->buffer = ggml_backend_mpi_wrap_buffer(cgraph->nodes[i]->buffer); -// fprintf(stderr, "Setting buffer ranks for node %s with old buff %s\n", cgraph->nodes[i]->name, ggml_backend_buffer_name(old_buffs[i].first)); - - ggml_backend_mpi_buffer_set_rank(cgraph->nodes[i]->buffer, ((ggml_backend_mpi_buffer_context*)old_buffs[i].first->context)->rank); - -// fprintf(stderr, "New buffer rank for node %s: %d\n", cgraph->nodes[i]->name, ctx->rank); + ggml_backend_mpi_buffer_set_rank(cgraph->nodes[i]->buffer, 
ggml_backend_mpi_buffer_rank(old_buffs[i].first)); for (int iter = 0; iter < GGML_MAX_SRC; iter++) { - auto* j = cgraph->nodes[i]->src[iter]; - if (j == nullptr) { + auto* src_node = cgraph->nodes[i]->src[iter]; + if (src_node == nullptr) { break; } - if (j->buffer->iface.get_name == ggml_backend_mpi_buffer_name) { -// fprintf(stderr, "Skipping buffer ranks for src node %s with buffer type %s\n", j->name, j->buffer->buft->iface.get_name(j->buffer->buft)); + if (src_node->buffer->iface.get_name == ggml_backend_mpi_buffer_name) { continue; } -// fprintf(stderr, "Setting buffer ranks for src node %s\n", j->name); + src_node->buffer = ggml_backend_mpi_wrap_buffer(src_node->buffer); - -// fprintf(stderr, "Source buffer name: %s, buffer type name: %s\n", -// j->buffer->iface.get_name(j->buffer), -// j->buffer->buft->iface.get_name(j->buffer->buft)); - - j->buffer = ggml_backend_mpi_wrap_buffer(j->buffer); - - ggml_backend_mpi_buffer_set_rank(j->buffer, ((ggml_backend_mpi_buffer_type_context*)old_buffs[i].second[iter]->context)->rank); -// j->buffer->buft = old_buffs[i].second[iter]; - -// fprintf(stderr, "New source buffer name: %s, buffer type name: %s\n", -// j->buffer->iface.get_name(j->buffer), -// j->buffer->buft->iface.get_name(j->buffer->buft)); - -// ggml_backend_mpi_buffer_type_set_rank(j->buffer->buft, ctx->rank); + ggml_backend_mpi_buffer_set_rank(src_node->buffer, ggml_backend_mpi_buffer_rank(old_buffs[i].second[iter])); } if(cgraph->nodes[i]->view_src != nullptr && cgraph->nodes[i]->view_src->buffer->buft != nullptr) { -// fprintf(stderr, "View source %s (buffer name: %s), buffer type name: %s\n", cgraph->nodes[i]->view_src->name, -// cgraph->nodes[i]->view_src->buffer->iface.get_name(cgraph->nodes[i]->view_src->buffer), -// cgraph->nodes[i]->view_src->buffer->buft->iface.get_name(cgraph->nodes[i]->view_src->buffer->buft)); -// fprintf(stderr, "Old view source buffer type name: %s\n", -// old_view_buffs[i]->iface.get_name(old_view_buffs[i])); -// cgraph->nodes[i]->view_src->buffer = ggml_backend_mpi_wrap_buffer(cgraph->nodes[i]->view_src->buffer); - // WRONG, need to keep the source ranks from before compute -// fprintf(stderr, "View buff %s null\n", (old_view_buffs[i]->context == nullptr) ? 
" is " : "is NOT "); if (old_view_buffs[i] != nullptr) { - if (old_view_buffs[i]->iface.get_name == ggml_backend_mpi_buffer_type_name) { + if (old_view_buffs[i]->iface.get_name == ggml_backend_mpi_buffer_name) { ggml_backend_mpi_buffer_set_rank(cgraph->nodes[i]->view_src->buffer, - ((ggml_backend_mpi_buffer_type_context *) old_view_buffs[i]->context)->rank); - } else { -// ggml_backend_mpi_buffer_set_rank(cgraph->nodes[i]->view_src->buffer, -// ctx->rank); + ggml_backend_mpi_buffer_rank(old_view_buffs[i])); } } } } + + // FIXME check if this is correct or not (it's probably not) for (int i = 0; i < cgraph->n_leafs; i++) { -// fprintf(stderr, "Wrapping leaf %s...\n", cgraph->leafs[i]->name); cgraph->leafs[i]->buffer = ggml_backend_mpi_wrap_buffer(cgraph->leafs[i]->buffer); -// fprintf(stderr, "Wrapped leaf buffer: %s\n", ggml_backend_buffer_name(cgraph->leafs[i]->buffer)); ggml_backend_mpi_buffer_type_set_rank(cgraph->leafs[i]->buffer->buft, ctx->rank); -// fprintf(stderr, "Wrapped leaf after setting rank: %s\n", ggml_backend_buffer_name(cgraph->leafs[i]->buffer)); } - -// ggml_mpi_graph_compute_post(ctx, cgraph); - return GGML_STATUS_SUCCESS; } @@ -695,7 +610,7 @@ static const char * ggml_backend_mpi_name(ggml_backend_t backend) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); - return strdup(("MPI(Rank " + std::to_string(((ggml_mpi_context*)backend->context)->rank) + ", local rank " + std::to_string(rank) + ")").c_str()); + return strdup(("MPI(Rank " + std::to_string(ggml_backend_mpi_rank(backend)) + ", local rank " + std::to_string(ggml_backend_mpi_local_rank(backend)) + ")").c_str()); } static void ggml_backend_mpi_free(ggml_backend_t backend) { @@ -765,64 +680,39 @@ GGML_CALL bool ggml_backend_is_mpi(ggml_backend_t backend) { -GGML_CALL static const char * ggml_backend_mpi_buffer_type_name(ggml_backend_buffer_type_t buft) { - auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - return strdup(((ctx->name + " Buffer Type(Rank " + std::to_string(ctx->rank) + ", local rank " + std::to_string(rank) + "):") + std::string(ctx->wrapped_buffer->iface.get_name(ctx->wrapped_buffer))).c_str()); -} GGML_CALL static ggml_backend_buffer_t ggml_backend_mpi_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { -// fprintf(stderr, "ALLOCATING NEW BUFFER FOR BUFFER_TYPE %s\n", ggml_backend_buft_name(buft)); - auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; + auto* buffer = ggml_backend_mpi_wrap_buffer( + ggml_backend_buft_alloc_buffer(ggml_backend_mpi_buffer_type_unwrap(buft), size) + ); -// fprintf(stderr, "WRAPPED BUFFER_TYPE %s\n", ggml_backend_buft_name(ctx->wrapped_buffer)); - - - auto* buffer = ggml_backend_mpi_wrap_buffer(ctx->wrapped_buffer->iface.alloc_buffer(ctx->wrapped_buffer, size)); - -// fprintf(stderr, "NEW BUFFER: %s, BUFFER_TYPE: %s\n", ggml_backend_buffer_name(buffer), ggml_backend_buft_name(buffer->buft)); - - - ggml_backend_mpi_buffer_set_rank(buffer, ctx->rank); - -// fprintf(stderr, "NEW BUFFER AFTER SETTING RANK: %s, BUFFER_TYPE: %s\n", ggml_backend_buffer_name(buffer), ggml_backend_buft_name(buffer->buft)); + ggml_backend_mpi_buffer_copy_ctx_from_type(buft, buffer); return buffer; } GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - return ctx->wrapped_buffer->iface.get_alignment(ctx->wrapped_buffer); + return 
ggml_backend_buft_get_alignment(ggml_backend_mpi_buffer_type_unwrap(buft)); } GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - return ctx->wrapped_buffer->iface.get_max_size(ctx->wrapped_buffer); + return ggml_backend_buft_get_max_size(ggml_backend_mpi_buffer_type_unwrap(buft)); } GGML_CALL static size_t ggml_backend_mpi_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { -// fprintf(stderr, "GETTING ALLOC SIZE FOR TENSOR %s (%s) AND BUFFER TYPE %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buft_name(buft)); - - - auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - return ctx->wrapped_buffer->iface.get_alloc_size(ctx->wrapped_buffer, tensor); + // Have to do this instead of calling ggml_backend_type_get_alloc_size because that signature doesn't have const on tensor + return ggml_backend_mpi_buffer_type_unwrap(buft)->iface.get_alloc_size(ggml_backend_mpi_buffer_type_unwrap(buft), tensor); } GGML_CALL static bool ggml_backend_mpi_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return backend != nullptr && ggml_backend_is_mpi(backend) && ((ggml_backend_mpi_buffer_type_context*) buft->context)->rank == ((ggml_mpi_context*)backend->context)->rank; + return backend != nullptr && ggml_backend_is_mpi(backend) && ggml_backend_mpi_buffer_type_rank(buft) == ggml_backend_mpi_rank(backend); } GGML_CALL static bool ggml_backend_mpi_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - auto * ctx = (ggml_backend_mpi_buffer_type_context *) buft->context; - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - return ctx->rank == rank && ctx->wrapped_buffer->iface.is_host(ctx->wrapped_buffer); + return ggml_backend_mpi_buffer_type_rank(buft) == ggml_backend_mpi_buffer_type_local_rank(buft) && ggml_backend_buft_is_host(ggml_backend_mpi_buffer_type_unwrap(buft)); } @@ -854,11 +744,18 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer_type(ggml_back - auto* ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{ + auto* ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type { /* .iface = */ ggml_backend_mpi_buffer_type_interface, - /* .context = */ new ggml_backend_mpi_buffer_type_context{"MPI",buft, 0}, + /* .context = */ new ggml_backend_mpi_buffer_type_context{ + /* .name = */ "MPI", + /* .wrapped_buffer_type = */ buft, + /* .ctx_mpi = */ ggml_mpi_init() + } }; + // Set rank to 0 as default + ggml_backend_mpi_buffer_type_set_rank(ggml_backend_wrapped_buffer_type, 0); + cached_wrappers[buft] = ggml_backend_wrapped_buffer_type; return ggml_backend_wrapped_buffer_type; @@ -883,37 +780,26 @@ GGML_CALL static void ggml_backend_mpi_buffer_set_tensor(ggml_backend_buffer_t b } GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; - auto * type_ctx = (ggml_backend_mpi_buffer_type_context *) buffer->buft->context; + int rank = ggml_backend_mpi_buffer_local_rank(tensor->buffer); - - - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - int src_rank = ((ggml_backend_mpi_buffer_type_context*)tensor->buffer->buft->context)->rank; + int src_rank = ggml_backend_mpi_buffer_rank(tensor->buffer); if (rank != src_rank) { - ggml_mpi_tensor_recv(tensor, 
data, ((ggml_backend_mpi_buffer_type_context*)tensor->buffer->buft->context)->rank, MPI_COMM_WORLD); + ggml_mpi_tensor_recv(tensor, data, ggml_backend_mpi_buffer_rank(tensor->buffer), ggml_backend_mpi_buffer_get_comm(tensor->buffer)); return; } -// fprintf(stderr, "GETTING TENSOR WITH SRC RANK=RANK (%d) FOR TENSOR %s (%s) AND TGT BUFFER %s\n", src_rank, tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer)); - ctx->wrapped_buffer->iface.get_tensor(ctx->wrapped_buffer, tensor, data, offset, size); + ggml_backend_mpi_buffer_unwrap(buffer)->iface.get_tensor(ggml_backend_mpi_buffer_unwrap(buffer), tensor, data, offset, size); } GGML_CALL static bool ggml_backend_mpi_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; -// fprintf(stderr, "DOING LOCAL TENSOR COPY FOR SRC %s (%s) AND DST %s (%s) WITH TGT BUFFER %s\n", src->name, ggml_backend_buffer_name(src->buffer), dst->name, ggml_backend_buffer_name(dst->buffer), ggml_backend_buffer_name(buffer)); - - return ctx->wrapped_buffer->iface.cpy_tensor(ctx->wrapped_buffer, src, dst); + return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst); } GGML_CALL static void ggml_backend_mpi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; - return ctx->wrapped_buffer->iface.clear(ctx->wrapped_buffer, value); + return ggml_backend_mpi_buffer_unwrap(buffer)->iface.clear(ggml_backend_mpi_buffer_unwrap(buffer), value); } static struct ggml_backend_buffer_i mpi_backend_buffer_i = { @@ -930,10 +816,6 @@ static struct ggml_backend_buffer_i mpi_backend_buffer_i = { GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_t buf) { -// if (buf->iface.get_name == mpi_backend_buffer_i.get_name) { -// return buf; -// } - // if (cached_buffer_wrappers.find(buf) != cached_buffer_wrappers.end()) { // fprintf(stderr, "Returning cached buffer with name %s\n", cached_buffer_wrappers[buf]->iface.get_name(cached_buffer_wrappers[buf])); // auto * ret = new ggml_backend_buffer; @@ -950,11 +832,15 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer auto *buffer = new ggml_backend_buffer { /* .interface = */ mpi_backend_buffer_i, /* .buft = */ t, - /* .context = */ new ggml_backend_mpi_buffer_context{buf, ((ggml_backend_mpi_buffer_type_context*)t->context)->rank}, + /* .context = */ new ggml_backend_mpi_buffer_context{ + buf, ggml_mpi_init()}, /* .size = */ buf->size, /* .usage = */ buf->usage }; + // Default to node 0 when wrapping buffers + ggml_backend_mpi_buffer_set_rank(buffer, 0); + cached_buffer_wrappers[buf] = buffer; @@ -963,16 +849,12 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer } bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) { - int src_rank = ((ggml_backend_mpi_buffer_type_context*)src->buffer->buft->context)->rank; - int dst_rank = ((ggml_backend_mpi_buffer_type_context*)dst->buffer->buft->context)->rank; - -// fprintf(stderr, "Running tensor async copy for src %s (buffer %s) and dst %s (buffer %s) with backend %s\n", src->name, ggml_backend_buffer_name(src->buffer), dst->name, ggml_backend_buffer_name(dst->buffer), backend->iface.get_name(backend)); + int src_rank = ggml_backend_mpi_buffer_rank(src->buffer); + 
int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer); auto * ctx = static_cast(backend->context); if (ctx->remote) { - -// fprintf(stderr, "Skipping tensor copy for remote backend %s.\n", backend->iface.get_name(backend)); return true; } @@ -991,9 +873,7 @@ bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml } void ggml_backend_mpi_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * dst, const void* data, size_t offset, size_t size) { - int dst_rank = ((ggml_backend_mpi_buffer_type_context*)dst->buffer->buft->context)->rank; - -// fprintf(stderr, "Running set tensor for dst %s (buffer %s) with backend %s\n", dst->name, ggml_backend_buffer_name(dst->buffer), backend->iface.get_name(backend)); + int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer); auto * ctx = static_cast(backend->context); @@ -1079,7 +959,7 @@ int ggml_backend_mpi_reg_devices() { GGML_CALL void ggml_backend_mpi_buffer_type_set_rank(ggml_backend_buffer_type_t buft, int rank) { if (buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { - ((ggml_backend_mpi_buffer_type_context *) buft->context)->rank = rank; + ((ggml_backend_mpi_buffer_type_context *) buft->context)->ctx_mpi->rank = rank; } else { GGML_ASSERT(!"Buffer type must be wrapped in ggml_backend_mpi_buffer_type"); } @@ -1087,9 +967,11 @@ GGML_CALL void ggml_backend_mpi_buffer_type_set_rank(ggml_backend_buffer_type_t GGML_CALL void ggml_backend_mpi_buffer_set_rank(ggml_backend_buffer_t buf, int rank) { if (buf->iface.get_name == ggml_backend_mpi_buffer_name) { - ((ggml_backend_mpi_buffer_context *) buf->context)->rank = rank; + ((ggml_backend_mpi_buffer_context *) buf->context)->ctx_mpi->rank = rank; ggml_backend_mpi_buffer_type_set_rank(buf->buft, rank); } else { GGML_ASSERT(!"Buffer type must be wrapped in ggml_backend_mpi_buffer_type"); } } + + diff --git a/ggml-mpi.h b/ggml-mpi.h index 6497f47c8..b4f616d60 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -95,8 +95,6 @@ void ggml_mpi_backend_free(void); */ struct ggml_mpi_context * ggml_mpi_init(void); -void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers); - GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer_type(ggml_backend_buffer_type_t buft); GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_t buf); @@ -178,7 +176,7 @@ void ggml_mpi_eval_init( int8_t ** logits, uint32_t n_seq_max); -void ggml_mpi_synch_int( +void ggml_mpi_sync_int( struct ggml_mpi_context * ctx_mpi, int32_t * val ); @@ -207,45 +205,6 @@ uint16_t** ggml_mpi_split_range( float node_weights[] ); -/** - * Scatter the layer ranges across all nodes - * in the given context. This is a collective operation - * and must be called by all nodes that are within the same - * communicator. The given layer ranges must be in the same - * format as created by the ggml_mpi_split_range(). - * - * @param ctx_mpi The context to scatter the layers across. - * @param layer_ranges The pre-split ranges to scatter to the nodes. - */ -void ggml_mpi_scatter_layers( - struct ggml_mpi_context * ctx_mpi, - uint16_t ** layer_ranges -); - -/** - * Modify compute graph to only process allocated - * layers. - * - * @param ctx_mpi The context containing the allocated layer range. - * @param gf The compute graph to modify - * @param n_layers The number of layers in the model, used as an upper bound in the layer ranges. 
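// Illustrative sketch, not part of the patch: the ownership-based routing that
// the wrapped-buffer copy paths in this patch follow. Each MPI-wrapped buffer
// records the rank that owns its data; a copy stays local when both tensors
// live on the same rank, otherwise the owner of the source sends and the owner
// of the destination receives. sketch_tensor and local_copy are stand-ins for
// ggml_tensor and the wrapped buffer's own cpy_tensor implementation, not real
// ggml types.
#include <mpi.h>

struct sketch_tensor {
    void * data;
    int    n_bytes;
    int    owner_rank; // rank recorded by the wrapping MPI buffer
};

static void copy_across_ranks(const sketch_tensor & src, sketch_tensor & dst,
                              void (*local_copy)(const sketch_tensor &, sketch_tensor &),
                              MPI_Comm comm) {
    int my_rank = 0;
    MPI_Comm_rank(comm, &my_rank);

    if (src.owner_rank == dst.owner_rank) {
        // both tensors live on the same node: no MPI traffic is needed
        if (my_rank == src.owner_rank) {
            local_copy(src, dst);
        }
        return;
    }

    if (my_rank == src.owner_rank) {
        // we own the source: push its bytes to the destination's owner
        MPI_Send(src.data, src.n_bytes, MPI_BYTE, dst.owner_rank, /*tag=*/0, comm);
    } else if (my_rank == dst.owner_rank) {
        // we own the destination: receive the bytes from the source's owner
        MPI_Recv(dst.data, dst.n_bytes, MPI_BYTE, src.owner_rank, /*tag=*/0, comm, MPI_STATUS_IGNORE);
    }
    // ranks that own neither tensor take no part in the transfer
}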
- */ -void ggml_mpi_graph_compute_pre( - struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf); - -/** - * Sends the output tensor to the next node for processing - * of later layers. - * - * @param ctx_mpi The context to use for MPI operations. - * @param gf The graph used in the computations - * @param n_layers The number of layers in the model. - */ -void ggml_mpi_graph_compute_post( - struct ggml_mpi_context * ctx_mpi, - struct ggml_cgraph * gf); - // BACKEND V2 struct ggml_mpi_device { From 4692644ff9262c0db9d941c691194ec0e05bcce0 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Wed, 13 Mar 2024 01:38:38 -0500 Subject: [PATCH 28/35] Remove hard-coded layer splits and support more than 2 nodes --- common/common.cpp | 4 +++ ggml-mpi.cpp | 66 ++++++++++++++++++++++++----------------------- ggml-mpi.h | 2 +- ggml.h | 2 +- llama.cpp | 59 ++++++++++++++++++++++++------------------ llama.h | 2 +- 6 files changed, 75 insertions(+), 60 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 46ec366b0..4a8e93cc5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1319,6 +1319,10 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.kv_overrides = params.kv_overrides.data(); } + free((void *) mparams.node_layer_weights); + + mparams.node_layer_weights = params.mpi_layer_split.data(); + return mparams; } diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 30e74c8bc..95dcb0fd3 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -285,34 +285,27 @@ uint16_t** ggml_mpi_split_range( struct ggml_mpi_context * ctx_mpi, uint16_t start, uint16_t end, - float node_weights[] + const float node_weights[] ) { // Splits the range given by start and end // over the available nodes. This implementation // assumes that node 0 handles the final part of the range // while node 1 handles the beginning, to form a ring pipeline - // Only node 0 deals with the device splits, other nodes - // get the splits from the scatter layers operation - - if (ctx_mpi->rank != 0) { - return NULL; - } - uint16_t range_length = end - start + 1; uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size); for (int i = 0; i < ctx_mpi->size; i++) { ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2); } uint16_t next_layer = 0; - for (int i=1; i < ctx_mpi->size; i++) { + for (int i=0; i < ctx_mpi->size; i++) { ranges[i][0] = next_layer; ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start); next_layer = ranges[i][1]; } - ranges[0][0] = next_layer; - ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start); +// ranges[0][0] = next_layer; +// ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start); return ranges; } @@ -775,8 +768,13 @@ GGML_CALL static void ggml_backend_mpi_buffer_free_buffer(ggml_backend_buffer_t GGML_CALL static void ggml_backend_mpi_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context; + + if (ggml_backend_mpi_buffer_rank(buffer) != ggml_backend_mpi_buffer_local_rank(buffer)) { + return; + } + // fprintf(stderr, "SETTING TENSOR WITHOUT MPI CALLS FOR %s (%s) AND TGT BUFFER %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer)); - return ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size); + ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, 
offset, size); } GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -794,8 +792,12 @@ GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t b } GGML_CALL static bool ggml_backend_mpi_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_mpi_buffer_rank(src->buffer) == ggml_backend_mpi_buffer_rank(dst->buffer)) { + return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, + dst); + } - return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst); + return true; } GGML_CALL static void ggml_backend_mpi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -849,25 +851,25 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer } bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) { - int src_rank = ggml_backend_mpi_buffer_rank(src->buffer); - int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer); - - auto * ctx = static_cast(backend->context); - - if (ctx->remote) { - return true; - } - - if (src_rank == dst_rank) { -// src->buffer->iface.cpy_tensor(src->buffer, src, dst); - return true; - } - - if (src_rank == ctx->rank) { - ggml_mpi_tensor_send(src, dst_rank, ctx->comm); - } else if (dst_rank == ctx->rank){ - ggml_mpi_tensor_recv(dst, src_rank, ctx->comm); - } +// int src_rank = ggml_backend_mpi_buffer_rank(src->buffer); +// int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer); +// +// auto * ctx = static_cast(backend->context); +// +// if (ctx->remote) { +// return true; +// } +// +// if (src_rank == dst_rank) { +//// src->buffer->iface.cpy_tensor(src->buffer, src, dst); +// return true; +// } +// +// if (src_rank == ggml_backend_mpi_local_rank(backend)) { +// ggml_mpi_tensor_send(src, dst_rank, ctx->comm); +// } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){ +// ggml_mpi_tensor_recv(dst, src_rank, ctx->comm); +// } return true; } diff --git a/ggml-mpi.h b/ggml-mpi.h index b4f616d60..fe8358f2d 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -202,7 +202,7 @@ uint16_t** ggml_mpi_split_range( struct ggml_mpi_context * ctx_mpi, uint16_t start, uint16_t end, - float node_weights[] + const float node_weights[] ); // BACKEND V2 diff --git a/ggml.h b/ggml.h index 3544e9d6a..e4aabab05 100644 --- a/ggml.h +++ b/ggml.h @@ -226,7 +226,7 @@ #define GGML_MAX_DIMS 4 #define GGML_MAX_PARAMS 2048 -#define GGML_MAX_CONTEXTS 128 +#define GGML_MAX_CONTEXTS 256 #define GGML_MAX_SRC 10 #ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 diff --git a/llama.cpp b/llama.cpp index 9e0343cad..96b2adbbe 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2001,6 +2001,10 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; +#ifdef GGML_USE_MPI + ggml_mpi_context * ctx_mpi = nullptr; +#endif + ~llama_model() { for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); @@ -2099,9 +2103,7 @@ struct llama_context { struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] -#ifdef GGML_USE_MPI - ggml_mpi_context * ctx_mpi = NULL; -#endif + }; // @@ -3277,6 +3279,11 @@ static void llm_load_hparams( auto & hparams = model.hparams; const gguf_context * ctx = ml.ctx_gguf; +#ifdef GGML_USE_MPI + model.ctx_mpi = ggml_mpi_init(); + +#endif + 
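// Illustrative sketch, not part of the patch: how the --mpi-layer-split weights
// become the contiguous per-rank layer ranges that llm_load_tensors then uses
// to assign each layer's buffer type to a rank, following the arithmetic in
// ggml_mpi_split_range above (start=0, end=n_layer-1, truncating float math,
// clamped to the last layer). The layer count and weights below are made-up
// example values, not defaults from the patch.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint16_t n_layer = 32;                      // example model depth
    const std::vector<float> weights = {0.8f, 0.2f};  // e.g. --mpi-layer-split 0.8,0.2

    const uint16_t start = 0;
    const uint16_t end   = n_layer - 1;
    const uint16_t range_length = end - start + 1;

    uint16_t next_layer = 0;
    for (size_t i = 0; i < weights.size(); ++i) {
        const uint16_t lo = next_layer;
        const uint16_t hi = std::min<uint16_t>(end, (uint16_t) (lo + weights[i] * range_length + start));
        next_layer = hi;
        // with 32 layers and weights 0.8/0.2 this prints [0, 25] then [25, 31]
        std::printf("rank %zu: layers [%d, %d]\n", i, (int) lo, (int) hi);
    }
    return 0;
}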
// get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { enum gguf_type type = gguf_get_kv_type(ctx, i); @@ -4008,6 +4015,7 @@ static bool llm_load_tensors( enum llama_split_mode split_mode, int main_gpu, const float * tensor_split, + const float * node_split, bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { @@ -4097,11 +4105,17 @@ static bool llm_load_tensors( } #ifdef GGML_USE_MPI - // TESTING: Setting all non-input/output layers to node 1 - for (int64_t i = 0; i < n_layer; i++) { - ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1); - ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1); + uint16_t** ranges = ggml_mpi_split_range(model.ctx_mpi, 0, n_layer - 1, node_split); + + size_t size = ggml_mpi_size(model.ctx_mpi); + + for (size_t i = 0; i < size; i++) { + for (uint16_t j = ranges[i][0]; j < ranges[i][1]; j++) { + printf("Setting buffer rank for i %zu and j %d\n", i, j); + ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft, (int)i); + ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft_matrix, (int)i); + } } @@ -5101,7 +5115,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam #endif if (!llm_load_tensors( - ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, + ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.node_layer_weights, params.use_mlock, params.progress_callback, params.progress_callback_user_data )) { return -2; @@ -8813,7 +8827,7 @@ static int llama_decode_internal( uint32_t n_tokens_all = batch_all.n_tokens; #ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max); + ggml_mpi_eval_init(lctx.model.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max); n_tokens_all = batch_all.n_tokens; #endif @@ -9003,7 +9017,7 @@ static int llama_decode_internal( // update the graphs to skip "result_output" if logits are not needed if (res) { #ifdef GGML_USE_MPI - if (ggml_mpi_rank(lctx.ctx_mpi) == 0) { + if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) { #endif ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); @@ -12636,7 +12650,7 @@ static int llama_apply_lora_from_file_internal( // struct llama_model_params llama_model_default_params() { struct llama_model_params result = { - static_cast(calloc(1, sizeof(int32_t))), + static_cast(calloc(1, sizeof(float))), /*.n_gpu_layers =*/ 0, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, @@ -12706,7 +12720,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { int llama_node_id(struct llama_context * ctx) { #ifdef GGML_USE_MPI - return ggml_mpi_rank(ctx->ctx_mpi); + return ggml_mpi_rank(ctx->model.ctx_mpi); #endif return 0; @@ -13026,8 +13040,13 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_MPI + std::vector new_backends; - ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1), ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)}; + for (size_t i = 0; i < ggml_mpi_size(model->ctx_mpi); i++) { + new_backends.push_back(ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), (int) i)); + } + + ctx->backends = new_backends; @@ -13144,23 
+13163,13 @@ struct llama_context * llama_new_context_with_model( } } -#ifdef GGML_USE_MPI - ctx->ctx_mpi = ggml_mpi_init(); -#endif return ctx; } void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) { -//#ifdef GGML_USE_MPI -// if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) { -// GGML_ASSERT(false && "Must have same number of split percentages as devices"); -// } -// uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); -// ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); -// free(ranges); -//#endif + } void llama_free(struct llama_context * ctx) { @@ -13998,7 +14007,7 @@ int32_t llama_decode( struct llama_batch batch) { #ifdef GGML_USE_MPI - if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + if (ggml_mpi_rank(ctx->model.ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process const int n_ctx = llama_n_ctx(ctx); std::vector tmp(n_ctx, llama_token_bos(&ctx->model)); diff --git a/llama.h b/llama.h index 2f2e775ca..48ac9a324 100644 --- a/llama.h +++ b/llama.h @@ -203,7 +203,7 @@ extern "C" { struct llama_model_params { // Array of layers to allocate to each node - int32_t* n_node_layers; + const float * node_layer_weights; int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs From e8a61568e95dee14d8ee06e07b19418027760f3d Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 14 Mar 2024 19:56:23 -0500 Subject: [PATCH 29/35] Use CXX and CXXFLAGS for ggml-mpi compilation in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f8ad9f146..3b7ff0c4d 100644 --- a/Makefile +++ b/Makefile @@ -574,7 +574,7 @@ endif # LLAMA_METAL ifdef LLAMA_MPI ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h - $(CC) $(CFLAGS) -c $< -o $@ + $(CXX) $(CXXFLAGS) -c $< -o $@ endif # LLAMA_MPI GF_CC := $(CC) From 2217b02c99d5e85c03168302b51ab58757b81800 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Thu, 14 Mar 2024 22:24:54 -0500 Subject: [PATCH 30/35] Change requirement of last backend being CPU to requiring its default buffer type be a host buffer, fix rebase errors --- ggml-backend.c | 2 +- ggml-mpi.cpp | 7 ++++--- llama.cpp | 18 +++++++++++------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 31f8d5a6d..7429a1f44 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1696,7 +1696,7 @@ ggml_backend_sched_t ggml_backend_sched_new( bool parallel) { GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); - GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU + GGML_ASSERT(ggml_backend_buft_is_host(ggml_backend_get_default_buffer_type(backends[n_backends - 1]))); // last backend must be host struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 95dcb0fd3..6e12e93f5 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -548,7 +548,7 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t if (!ctx->remote) { ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), - (int) ctx->backends.size(), cgraph->n_nodes); + (int) ctx->backends.size(), cgraph->n_nodes, false); ggml_backend_sched_reserve(sched, cgraph); ggml_backend_sched_graph_compute(sched, cgraph); @@ -850,7 +850,7 @@ 
GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer return buffer; } -bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) { +bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { // int src_rank = ggml_backend_mpi_buffer_rank(src->buffer); // int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer); // @@ -870,7 +870,8 @@ bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml // } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){ // ggml_mpi_tensor_recv(dst, src_rank, ctx->comm); // } - return true; +// fprintf(stderr, "ATTEMPTING ASYNC COPY FOR SRC TENSOR %s TO DST TENSOR %s WITH SRC BACKEND %s AND DST BACKEND %s\n", src->name, dst->name, ggml_backend_name(backend_src), ggml_backend_name(backend_dst)); + return false; } diff --git a/llama.cpp b/llama.cpp index 96b2adbbe..6ffd905fd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9012,13 +9012,15 @@ static int llama_decode_internal( // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) { +#endif + // extract logits // TODO: do not compute and extract logits if only embeddings are needed // update the graphs to skip "result_output" if logits are not needed if (res) { - #ifdef GGML_USE_MPI - if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) { -#endif + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); GGML_ASSERT(backend_res != nullptr); @@ -9104,6 +9106,10 @@ static int llama_decode_internal( } break; } } + +#ifdef GGML_USE_MPI + } +#endif } // wait for the computation to finish (automatically done when obtaining the model output) @@ -9121,9 +9127,7 @@ static int llama_decode_internal( } } -#ifdef GGML_USE_MPI - } -#endif + return 0; } @@ -13051,7 +13055,7 @@ struct llama_context * llama_new_context_with_model( // ctx->backend_cpu = ctx->backends.back(); - ctx->backends.push_back(ctx->backend_cpu); + ctx->backends.push_back(ggml_backend_mpi_init(&ctx->backend_cpu, 1, ggml_mpi_rank(model->ctx_mpi))); #endif From cc551dfdfeb44f341d3ed13c1887276ef0036251 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Mon, 18 Mar 2024 21:56:47 -0500 Subject: [PATCH 31/35] Fix breaks in gpt_params_find_arg --- common/common.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3738ab8b6..39eb7c909 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -169,7 +169,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; - break; + return true; } std::string arg_next = argv[i]; @@ -189,7 +189,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int if (arg == "-tb" || arg == "--threads-batch") { if (++i >= argc) { invalid_param = true; - break; + return true; } std::string arg_next = argv[i]; @@ -208,7 +208,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int if (arg == "-td" || arg == "--threads-draft") { if (++i >= argc) { invalid_param = true; - break; + return true; } std::string arg_next = argv[i]; @@ -227,7 +227,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int if (arg == "-tbd" || arg == "--threads-batch-draft") { if (++i >= argc) { invalid_param = true; - 
break; + return true; } std::string arg_next = argv[i]; @@ -912,7 +912,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int if (arg == "--mpi-layer-split") { if (++i >= argc) { invalid_param = true; - break; + return true; } std::string arg_next = argv[i]; From be63161d0463acef209b32adde1949909366f703 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 19 Mar 2024 11:02:18 -0500 Subject: [PATCH 32/35] Fix incorrect sched hash size, refactor new cmdline params to align with new style --- common/common.cpp | 31 ++++++++++++++++++------------- ggml-mpi.cpp | 9 +++++++-- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 39eb7c909..dd742ff25 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -184,6 +184,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.n_threads[node] = std::thread::hardware_concurrency(); } } + return true; } if (arg == "-tb" || arg == "--threads-batch") { @@ -204,6 +205,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.n_threads_batch[node] = std::thread::hardware_concurrency(); } } + return true; } if (arg == "-td" || arg == "--threads-draft") { if (++i >= argc) { @@ -223,6 +225,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.n_threads_draft[node] = std::thread::hardware_concurrency(); } } + return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { if (++i >= argc) { @@ -242,6 +245,7 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.n_threads_batch_draft[node] = std::thread::hardware_concurrency(); } } + return true; } if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { @@ -910,20 +914,21 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int return true; } if (arg == "--mpi-layer-split") { - if (++i >= argc) { - invalid_param = true; - return true; - } - std::string arg_next = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string arg_next = argv[i]; - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - params.mpi_layer_split.resize(split_arg.size()); - for (size_t node = 0; node < split_arg.size(); ++node) { - params.mpi_layer_split[node] = std::stof(split_arg[node]); - } + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + params.mpi_layer_split.resize(split_arg.size()); + for (size_t node = 0; node < split_arg.size(); ++node) { + params.mpi_layer_split[node] = std::stof(split_arg[node]); + } + return true; } if (arg == "--tensor-split" || arg == "-ts") { diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 6e12e93f5..7fb4e752c 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -513,6 +513,8 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t } } + size_t n_srcs = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { if (cgraph->nodes[i]->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { cgraph->nodes[i]->buffer = ggml_backend_mpi_buffer_unwrap(cgraph->nodes[i]->buffer); @@ -523,6 +525,7 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t break; } if (src->buffer->buft->iface.get_name == 
ggml_backend_mpi_buffer_type_name) { + n_srcs++; src->buffer = ggml_backend_mpi_buffer_unwrap(src->buffer); } } @@ -532,6 +535,7 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t if (src->buffer->buft != nullptr) { if (src->buffer->buft->iface.get_name == ggml_backend_mpi_buffer_type_name) { + n_srcs++; src->buffer = ggml_backend_mpi_buffer_unwrap(src->buffer); } } @@ -546,12 +550,13 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t } + if (!ctx->remote) { ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), - (int) ctx->backends.size(), cgraph->n_nodes, false); + (int) ctx->backends.size(), cgraph->n_nodes + cgraph->n_leafs + n_srcs, false); ggml_backend_sched_reserve(sched, cgraph); - ggml_backend_sched_graph_compute(sched, cgraph); + ggml_backend_sched_graph_compute_async(sched, cgraph); ggml_backend_sched_free(sched); } From 57ac2e7268a273f9cd2081c160a8f70f74eb45b1 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 19 Mar 2024 11:13:38 -0500 Subject: [PATCH 33/35] Fix all examples to use new thread vector params, remove mpi example --- examples/batched-bench/batched-bench.cpp | 4 +- examples/batched/batched.cpp | 4 +- examples/llava/llava-cli.cpp | 4 +- examples/mpi/CMakeLists.txt | 8 - examples/mpi/README.md | 60 -- examples/mpi/mpi.cpp | 876 ----------------------- examples/passkey/passkey.cpp | 4 +- examples/server/server.cpp | 4 +- examples/speculative/speculative.cpp | 2 +- 9 files changed, 11 insertions(+), 955 deletions(-) delete mode 100644 examples/mpi/CMakeLists.txt delete mode 100644 examples/mpi/README.md delete mode 100644 examples/mpi/mpi.cpp diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 19674dfd3..df9f95bb0 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -102,8 +102,8 @@ int main(int argc, char ** argv) { ctx_params.n_ctx = n_kv_max; ctx_params.n_batch = 512; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.n_threads[0]; + ctx_params.n_threads_batch = params.n_threads_batch[0] == -1 ? params.n_threads[0] : params.n_threads_batch[0]; // ensure enough sequences are available ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index ee1f8f1bf..ac7df9c9f 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -81,8 +81,8 @@ int main(int argc, char ** argv) { ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); ctx_params.n_seq_max = n_parallel; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.n_threads[0]; + ctx_params.n_threads_batch = params.n_threads_batch[0] == -1 ? 
params.n_threads[0] : params.n_threads_batch[0];
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index e29da6cb2..3dd1013ea 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -125,14 +125,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         if (!params->image.empty()) {
             fprintf(stderr, "using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads[0], prompt);
         if (!embed) {
             fprintf(stderr, "%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads[0], params->image.c_str());
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
             return NULL;
diff --git a/examples/mpi/CMakeLists.txt b/examples/mpi/CMakeLists.txt
deleted file mode 100644
index 07d83b61d..000000000
--- a/examples/mpi/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-set(TARGET mpi)
-add_executable(${TARGET} mpi.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
diff --git a/examples/mpi/README.md b/examples/mpi/README.md
deleted file mode 100644
index 4b934b0ed..000000000
--- a/examples/mpi/README.md
+++ /dev/null
@@ -1,60 +0,0 @@
-# llama.cpp/example/mpi
-
-This example program allows you to use various LLaMA language models in an easy and efficient way across an MPI cluster.
-It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
-
-## Table of Contents
-
-1. [Quick Start](#quick-start)
-2. [Common Options](#common-options)
-
-## Quick Start
-
-To get started right away, write the following to a file on each node, making sure to use the correct path for the model you have:
-```bash
---mpi-layer-split 0.8,0.2 -t 4 -m ~/llm-local/codellama-7b.Q3_K_M.gguf --color -c 512 --temp 0.0 --repeat_penalty 1.0 -n 128 -p "double fast_inverse_square_root(double x"
-```
-
-Each node may have different options, currently they must have the same number of arguments to the mpi-layer-split option and the same
-model path, but that will eventually be synchronized from the head node.
-
-Next, write the hostsfile on the head node. Make sure there is only one slot on each node.
-
-Finally, run the following command on the head node to start the program across the cluster:
-
-#### Unix-based systems (Linux, macOS, etc.):
-
-```bash
-mpirun -hostfile hostsfile -mca orte_keep_fqdn_hostnames t --bind-to none ./mpi options.txt
-```
-
-Where `hostsfile` is the file containing the cluster hostname configuration and `options.txt` is the path
-where each node can find its own options. Storing the model on a network filesystem has not yet been
-tested and optimized for.
-
-#### Windows:
-Not supported currently.
-
-For an interactive experience, try this command:
-
-#### Unix-based systems (Linux, macOS, etc.):
-
-```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
-'User: Hi
-AI: Hello. I am an AI chatbot. Would you like to talk?
-User: Sure!
-AI: What would you like to talk about?
-User:'
-```
-
-## Common Options
-
-In this section, we cover the most commonly used options for running the `mpi` program with the LLaMA models:
-
-- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
-- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
-- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
-- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
-- `--mpi-layer-split`: Set the percentage of layers to distribute to each node. Must have the same number of arguments as the number of nodes in the cluster. Only the layer split percentages passed to the head node are used, they are scattered to all other nodes in the cluster.
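The `--mpi-layer-split` weights described above are turned into contiguous per-node layer ranges on the head node and then scattered to the workers (the commented-out `ggml_mpi_split_range`/`ggml_mpi_scatter_layers` calls in `llama_split_layers_weighted` earlier in this series are the intended entry point). A minimal sketch of that mapping is shown below, assuming the weights are normalized against their sum and that the last node absorbs any rounding remainder; the helper name and the half-open range convention are illustrative only, not the patch's actual implementation.

```cpp
// Illustrative sketch (not part of the patch): map layer-split weights such as
// {0.8f, 0.2f} onto contiguous [start, end) layer ranges, one range per node.
// Assumes at least one positive weight.
#include <cstdint>
#include <cstdio>
#include <vector>

struct layer_range { uint16_t start; uint16_t end; }; // end is exclusive

static std::vector<layer_range> split_layers(const std::vector<float> & weights, uint16_t n_layers) {
    float total = 0.0f;
    for (float w : weights) total += w;

    std::vector<layer_range> ranges(weights.size());
    uint16_t start = 0;
    for (size_t i = 0; i < weights.size(); ++i) {
        uint16_t count = (uint16_t)(n_layers * (weights[i] / total) + 0.5f);
        if (i + 1 == weights.size() || start + count > n_layers) {
            count = n_layers - start; // last node absorbs the rounding error
        }
        ranges[i] = { start, (uint16_t)(start + count) };
        start += count;
    }
    return ranges;
}

int main() {
    // A 32-layer model split 0.8/0.2 yields roughly layers [0, 26) and [26, 32).
    for (const auto & r : split_layers({0.8f, 0.2f}, 32)) {
        std::printf("layers [%u, %u)\n", (unsigned) r.start, (unsigned) r.end);
    }
    return 0;
}
```

With `0.8,0.2` and a 32-layer model, this assigns layers [0, 26) to the head node and [26, 32) to the worker; the exact rounding behavior of `ggml_mpi_split_range` may differ.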
diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp deleted file mode 100644 index b4944099e..000000000 --- a/examples/mpi/mpi.cpp +++ /dev/null @@ -1,876 +0,0 @@ -#include "common.h" - -#include "console.h" -#include "llama.h" -#include "build-info.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO add Windows support -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#include -#endif - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static llama_context ** g_ctx; -static llama_model ** g_model; -static gpt_params * g_params; -static std::vector * g_input_tokens; -static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; -static bool is_interacting = false; - - -static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, - const std::vector & output_tokens -) { - if (params.logdir.empty()) { - return; - } - - const std::string timestamp = get_sortable_timestamp(); - - const bool success = create_directory_with_parents(params.logdir); - if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Generation Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - dump_string_yaml_multiline(logfile, "output", output.c_str()); - dump_vector_int_yaml(logfile, "output_tokens", output_tokens); - - llama_dump_timing_info_yaml(logfile, ctx); - fclose(logfile); -} - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -static void sigint_handler(int signo) { - if (signo == SIGINT) { - if (!is_interacting) { - is_interacting = true; - } else { - console::cleanup(); - printf("\n"); - llama_print_timings(*g_ctx); - write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); - _exit(130); - } - } -} -#endif - -int main(int argc, char ** argv) { - gpt_params params; - g_params = ¶ms; - - if (argc > 2) { - fprintf(stderr, "Must only have one argument, the file to read options from.\n"); - return 2; - } - - // Manually add the path used to launch this program to the - // options - std::string rawOptions = argv[0]; - rawOptions += ' '; - std::ifstream optionsFile(argv[1]); - if (optionsFile.is_open()) { - // Read in the options file, appending to the launch path - std::ostringstream buf; - buf << optionsFile.rdbuf(); - rawOptions += buf.str(); - optionsFile.close(); - - } else { - fprintf(stderr, "Cannot open options file at path %s\n", argv[1]); - return 3; - } - - // wordexp doesn't work right if there's a trailing newline, so strip it - 
rawOptions.erase(rawOptions.find_last_not_of(" \t\n\r\f\v") + 1); - - wordexp_t splitOptions; - wordexp(rawOptions.c_str(), &splitOptions, WRDE_NOCMD); - - // Now we can parse like normal, but using the loaded options instead of the passed argv - if (gpt_params_parse(splitOptions.we_wordc, splitOptions.we_wordv, params) == false) { - wordfree(&splitOptions); - return 1; - } - wordfree(&splitOptions); - llama_sampling_params & sparams = params.sparams; - -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - - // TODO: Dump params ? - //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - - // save choice to use color for later - // (note for later: this is a slightly awkward choice) - console::init(params.simple_io, params.use_color); - atexit([]() { console::cleanup(); }); - - if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); - - return 0; - } - - if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); - - return 0; - } - - if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; - } - - if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); - } - - if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); - } - - LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - LOG_TEE("%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - - LOG("%s: llama backend init\n", __func__); - llama_backend_init(params.numa); - - llama_model * model; - llama_context * ctx; - llama_context * ctx_guidance = NULL; - g_model = &model; - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (sparams.cfg_scale > 1.f) { - struct llama_context_params lparams = llama_context_params_from_gpt_params(params); - ctx_guidance = llama_new_context_with_model(model, lparams); - } - - if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); - return 1; - } - - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); - - if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); - } - - // print system information - { - LOG_TEE("\n"); - LOG_TEE("%s\n", get_system_info(params).c_str()); - } - - llama_split_layers_weighted(ctx, params.mpi_layer_split.data(), params.mpi_layer_split.size()); - - std::string path_session = params.path_prompt_cache; - std::vector session_tokens; - - if (!path_session.empty()) { - LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); - - // fopen to check for existing 
session - FILE * fp = std::fopen(path_session.c_str(), "rb"); - if (fp != NULL) { - std::fclose(fp); - - session_tokens.resize(n_ctx); - size_t n_token_count_out = 0; - if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); - return 1; - } - session_tokens.resize(n_token_count_out); - llama_set_rng_seed(ctx, params.seed); - - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); - } else { - LOG_TEE("%s: session file does not exist, will create\n", __func__); - } - } - - const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; - LOG("add_bos: %d\n", add_bos); - - std::vector embd_inp; - - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); - } else { - LOG("use session tokens\n"); - embd_inp = session_tokens; - } - - LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); - - // Should not run without any tokens - if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); - } - - // Tokenize negative prompt - std::vector guidance_inp; - int guidance_offset = 0; - int original_prompt_len = 0; - if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); - - original_prompt_len = original_inp.size(); - guidance_offset = (int)guidance_inp.size() - original_prompt_len; - LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); - LOG("guidance_offset: %s", log_tostr(guidance_offset)); - } - - if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); - return 1; - } - - // debug message about similarity of saved session, if applicable - size_t n_matching_session_tokens = 0; - if (!session_tokens.empty()) { - for (llama_token id : session_tokens) { - if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { - break; - } - n_matching_session_tokens++; - } - if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_TEE("%s: using full prompt from session file\n", __func__); - } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_TEE("%s: session file has exact match for prompt!\n", __func__); - } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); - } else { - LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); - } - - // remove any "future" tokens that we might have inherited from the previous session - 
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); - } - - LOGLN( - "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", - log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); - - // if we will use the cache for the full prompt without reaching the end of the cache, force - // reevaluation of the last token token to recalculate the cached logits - if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); - - session_tokens.resize(embd_inp.size() - 1); - } - - // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { - params.n_keep = (int)embd_inp.size(); - } - - // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); - - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); - - // in instruct mode, we inject a prefix and a suffix to each input by the user - if (params.instruct) { - params.interactive_first = true; - params.antiprompt.push_back("### Instruction:\n\n"); - } - - // enable interactive mode if interactive start is specified - if (params.interactive_first) { - params.interactive = true; - } - - if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); - } - - if (ctx_guidance) { - LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); - LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); - for (int i = 0; i < (int) guidance_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); - } - } - - if (params.n_keep > 0) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); - for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); - } - LOG_TEE("'\n"); - } - LOG_TEE("\n"); - } - - if (params.interactive) { -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? 
(sigint_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); -#endif - - LOG_TEE("%s: interactive mode on.\n", __func__); - - if (!params.antiprompt.empty()) { - for (const auto & antiprompt : params.antiprompt) { - LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); - if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); - } - } - } - } - - if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); - } - - if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); - if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); - } - } - } - - if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); - if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); - } - } - } - } - LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - LOG_TEE("\n\n"); - - if (params.interactive) { - const char *control_message; - if (params.multiline_input) { - control_message = " - To return control to LLaMa, end your input with '\\'.\n" - " - To return control without starting a new line, end your input with '/'.\n"; - } else { - control_message = " - Press Return to return control to LLaMa.\n" - " - To return control without starting a new line, end your input with '/'.\n" - " - If you want to submit another line, end your input with '\\'.\n"; - } - LOG_TEE("== Running in interactive mode. ==\n"); -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); -#endif - LOG_TEE( "%s\n", control_message); - - is_interacting = params.interactive_first; - } - - bool is_antiprompt = false; - bool input_echo = true; - bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); - - int n_past = 0; - int n_remain = params.n_predict; - int n_consumed = 0; - int n_session_consumed = 0; - int n_past_guidance = 0; - - std::vector input_tokens; g_input_tokens = &input_tokens; - std::vector output_tokens; g_output_tokens = &output_tokens; - std::ostringstream output_ss; g_output_ss = &output_ss; - - // the first thing we will do is to output the prompt, so set color accordingly - console::set_display(console::prompt); - - std::vector embd; - std::vector embd_guidance; - - struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - - while ((n_remain != 0 && !is_antiprompt) || params.interactive) { - // predict - if (!embd.empty()) { - // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via - // --prompt or --file which uses the same value. - int max_embd_size = n_ctx - 4; - - // Ensure the input doesn't exceed the context size by truncating embd if necessary. 
- if ((int) embd.size() > max_embd_size) { - const int skipped_tokens = (int) embd.size() - max_embd_size; - embd.resize(max_embd_size); - - console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); - console::set_display(console::reset); - fflush(stdout); - } - - // infinite text generation via context swapping - // if we run out of context: - // - take the n_keep first tokens from the original prompt (via n_past) - // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { - if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); - break; - } - - const int n_left = n_past - params.n_keep - 1; - const int n_discard = n_left/2; - - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - if (ctx_guidance) { - n_past_guidance -= n_discard; - } - - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - - LOG("clear session path\n"); - path_session.clear(); - } - - // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) - if (n_session_consumed < (int) session_tokens.size()) { - size_t i = 0; - for ( ; i < embd.size(); i++) { - if (embd[i] != session_tokens[n_session_consumed]) { - session_tokens.resize(n_session_consumed); - break; - } - - n_past++; - n_session_consumed++; - - if (n_session_consumed >= (int) session_tokens.size()) { - ++i; - break; - } - } - if (i > 0) { - embd.erase(embd.begin(), embd.begin() + i); - } - } - - // evaluate tokens in batches - // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { - int input_size = 0; - llama_token * input_buf = NULL; - - if (n_past_guidance < (int) guidance_inp.size()) { - // Guidance context should have the same data with these modifications: - // - // * Replace the initial prompt - // * Shift everything by guidance_offset - embd_guidance = guidance_inp; - if (embd.begin() + original_prompt_len < embd.end()) { - embd_guidance.insert( - embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() - ); - } - - input_buf = embd_guidance.data(); - input_size = embd_guidance.size(); - - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); - } else { - input_buf = embd.data(); - input_size = embd.size(); - } - - for (int i = 0; i < input_size; i += params.n_batch) { - int n_eval = std::min(input_size - i, params.n_batch); - if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - return 1; - } - - n_past_guidance += n_eval; - } - } - - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { - int n_eval = (int) embd.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - return 1; - } - - n_past += n_eval; - - LOG("n_past = 
%d\n", n_past); - } - - if (!embd.empty() && !path_session.empty()) { - session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); - n_session_consumed = session_tokens.size(); - } - } - - embd.clear(); - embd_guidance.clear(); - - if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - // optionally save the session on first sample (for faster prompt loading next time) - if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { - need_to_save_session = false; - llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - - LOG("saved session to %s\n", path_session.c_str()); - } - - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); - - llama_sampling_accept(ctx_sampling, ctx, id, true); - - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); - - embd.push_back(id); - - // echo this to console - input_echo = true; - - // decrement remaining sampling budget - --n_remain; - - LOG("n_remain: %d\n", n_remain); - } else { - // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); - while ((int) embd_inp.size() > n_consumed) { - embd.push_back(embd_inp[n_consumed]); - - // push the prompt in the sampling context in order to apply repetition penalties later - // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); - - ++n_consumed; - if ((int) embd.size() >= params.n_batch) { - break; - } - } - } - - // display text - if (input_echo) { - for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); - - if (embd.size() > 1) { - input_tokens.push_back(id); - } else { - output_tokens.push_back(id); - output_ss << token_str; - } - } - fflush(stdout); - } - // reset color to default if there is no pending user input - if (input_echo && (int) embd_inp.size() == n_consumed) { - console::set_display(console::reset); - } - - // if not currently processing queued inputs; - if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt in the last n_prev tokens - if (!params.antiprompt.empty()) { - const int n_prev = 32; - const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); - - is_antiprompt = false; - // Check if each of the reverse prompts appears at the end of the output. - // If we're not running interactively, the reverse prompt might be tokenized with some following characters - // so we'll compensate for that by widening the search window a bit. - for (std::string & antiprompt : params.antiprompt) { - size_t extra_padding = params.interactive ? 0 : 2; - size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) - ? 
last_output.length() - static_cast(antiprompt.length() + extra_padding) - : 0; - - if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { - if (params.interactive) { - is_interacting = true; - } - is_antiprompt = true; - break; - } - } - - if (is_antiprompt) { - LOG("found antiprompt: %s\n", last_output.c_str()); - } - } - - // deal with end of text token in interactive mode - if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { - LOG("found EOS token\n"); - - if (params.interactive) { - if (!params.antiprompt.empty()) { - // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); - embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); - is_antiprompt = true; - } - - is_interacting = true; - printf("\n"); - } else if (params.instruct) { - is_interacting = true; - } - } - - if (n_past > 0 && is_interacting) { - LOG("waiting for user input\n"); - - if (params.instruct) { - printf("\n> "); - } - - if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); - } - - std::string buffer; - if (!params.input_prefix.empty()) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - printf("%s", params.input_prefix.c_str()); - } - - // color user input only - console::set_display(console::user_input); - - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - - // done taking input, reset color - console::set_display(console::reset); - - // Add tokens to embd only if the input buffer is non-empty - // Entering a empty line lets the user pass control back - if (buffer.length() > 1) { - // append input suffix if any - if (!params.input_suffix.empty()) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - printf("%s", params.input_suffix.c_str()); - } - - LOG("buffer: '%s'\n", buffer.c_str()); - - const size_t original_size = embd_inp.size(); - - // instruct mode: insert instruction prefix - if (params.instruct && !is_antiprompt) { - LOG("inserting instruction prefix\n"); - n_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); - } - if (params.escape) { - process_escapes(buffer); - } - - const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); - const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); - - embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); - - // instruct mode: insert response suffix - if (params.instruct) { - LOG("inserting instruction suffix\n"); - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - } - - for (size_t i = original_size; i < embd_inp.size(); ++i) { - const llama_token token = embd_inp[i]; - output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); - } - - n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); - } else { - LOG("empty line, passing control back\n"); - } - - input_echo = false; // do not echo this again - } - - if (n_past > 0) { - if (is_interacting) { - llama_sampling_reset(ctx_sampling); 
- } - is_interacting = false; - } - } - - // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) { - LOG_TEE(" [end of text]\n"); - break; - } - - // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). - if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { - n_remain = params.n_predict; - is_interacting = true; - } - } - - if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); - llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - } - - llama_print_timings(ctx); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - - if (ctx_guidance) { llama_free(ctx_guidance); } - llama_free(ctx); - llama_free_model(model); - - llama_sampling_free(ctx_sampling); - llama_backend_free(); - -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - - return 0; -} diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 2cbc9e1fa..7fafc9e0b 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -94,8 +94,8 @@ int main(int argc, char ** argv) { ctx_params.seed = seed; ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; ctx_params.n_batch = 512; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.n_threads[0]; + ctx_params.n_threads_batch = params.n_threads_batch[0] == -1 ? 
params.n_threads[0] : params.n_threads_batch[0]; GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d2a8e541d..ecaf36d27 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2409,7 +2409,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - params.n_threads = std::stoi(argv[i]); + params.n_threads[0] = std::stoi(argv[i]); } else if (arg == "--grp-attn-n" || arg == "-gan") { if (++i >= argc) { invalid_param = true; @@ -2429,7 +2429,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - params.n_threads_batch = std::stoi(argv[i]); + params.n_threads_batch[0] = std::stoi(argv[i]); } else if (arg == "--threads-http") { if (++i >= argc) { invalid_param = true; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index e991b8846..e447c9949 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -71,7 +71,7 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.n_threads_draft > 0) { + if (params.n_threads_draft.size() > 0) { params.n_threads = params.n_threads_draft; } params.n_threads_batch = params.n_threads_batch_draft; From d2de181d9515f876d7b5462f685d7a67a5208908 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 19 Mar 2024 13:42:49 -0500 Subject: [PATCH 34/35] Port transactions from mpi-speculative, fix incorrect seq_id syncing (not tested) --- ggml-mpi.cpp | 137 +++++++++++++++++++++++++++++++++++--------------- ggml-mpi.h | 20 ++++++++ llama.cpp | 138 ++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 248 insertions(+), 47 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 7fb4e752c..9fa3e6717 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -6,8 +6,8 @@ #include -#include -#include +#include +#include #include #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -24,6 +24,8 @@ struct ggml_mpi_context { MPI_Comm comm; int layer_start; int layer_end; + MPI_Status status; + struct ggml_tensor *inp0; std::string name; struct ggml_backend * wrapped_backend; @@ -31,6 +33,8 @@ struct ggml_mpi_context { ggml_backend_sched_t scheduler; bool remote; void* send_buffer; + int trans_id; + int recv_trans_id; }; void ggml_mpi_backend_init(void) { @@ -122,12 +126,43 @@ void ggml_mpi_sync_pipelined( } if(ctx_mpi->rank < ctx_mpi->size - 1) { GGML_ASSERT(ctx_mpi->send_buffer != nullptr); + GGML_ASSERT(val != nullptr); + GGML_ASSERT(count < 128*1024*1024); + const int retval = MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); GGML_ASSERT(retval == MPI_SUCCESS); } } +void ggml_mpi_barrier(struct ggml_mpi_context * ctx_mpi) { + MPI_Barrier(ctx_mpi->comm); +} + +void ggml_mpi_probe(struct ggml_mpi_context * ctx_mpi, int src, int tag) { + MPI_Probe((src >= 0) ? src : MPI_ANY_SOURCE, (tag >= 0) ? tag : MPI_ANY_TAG, ctx_mpi->comm, &(ctx_mpi->status)); +} + +int ggml_mpi_iprobe(struct ggml_mpi_context * ctx_mpi, int src, int tag) { + if(ctx_mpi->comm == MPI_COMM_NULL) { + return 0; + } + + int ret; + MPI_Iprobe((src >= 0) ? src : MPI_ANY_SOURCE, (tag >= 0) ? 
tag : MPI_ANY_TAG, ctx_mpi->comm, &ret, &(ctx_mpi->status)); + return ret; +} + +int ggml_mpi_status_tag(struct ggml_mpi_context * ctx_mpi) { + return ctx_mpi->status.MPI_TAG; +} + +int ggml_mpi_status_count_int32(struct ggml_mpi_context * ctx_mpi) { + int32_t count; + MPI_Get_count(&ctx_mpi->status, MPI_INT32_T, &count); + return count; +} + void ggml_mpi_eval_init( struct ggml_mpi_context * ctx_mpi, int32_t * n_tokens, @@ -142,8 +177,15 @@ void ggml_mpi_eval_init( return; } - + int32_t old_n_tokens = *n_tokens; ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, GGML_MPI_N_TOKENS); + + if (old_n_tokens != *n_tokens) { + *pos = static_cast(realloc(*pos, *n_tokens * sizeof(int32_t))); + *n_seq_ids = static_cast(realloc(*n_seq_ids, *n_tokens * sizeof(int32_t))); + *logits = static_cast(realloc(*logits, *n_tokens * sizeof(int32_t))); + } + int8_t* temp_logits = (int8_t*) calloc(*n_tokens, sizeof(int8_t)); if (ctx_mpi->rank == 0 && *logits != nullptr) { @@ -183,49 +225,51 @@ void ggml_mpi_eval_init( // pre-allocated for the largest possible sizes, even on worker nodes. GGML_ASSERT(n_seq_ids != nullptr); + GGML_ASSERT(*n_seq_ids != nullptr); + GGML_ASSERT(n_tokens != nullptr); // FIXME Syncing n_seq_ids causes MPI to throw an invalid buffer error in Bsend -// ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, GGML_MPI_N_SEQ_IDS); + ggml_mpi_sync_pipelined(ctx_mpi, *n_seq_ids, *n_tokens, MPI_INT32_T, GGML_MPI_N_SEQ_IDS); // We need to know the total number of sequence // ids, so we count them all up -// int32_t total_n_seq_ids = 0; -// for (int32_t i = 0; i < *n_tokens; i++) { -// total_n_seq_ids += (*n_seq_ids)[i]; -// } -// -// // MPI can't chase the pointers for multidimensional arrays, so we flatten them first -// // for transit -// int32_t * flattened_seq_ids = static_cast(calloc(total_n_seq_ids, sizeof(int32_t))); -// -// int32_t current_index = 0; -// -// // Only rank 0 needs to flatten since the others don't have the real seq_id -// if (ctx_mpi->rank == 0) { -// for (int32_t i = 0; i < *n_tokens; i++) { -// for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { -// flattened_seq_ids[current_index] = (*seq_id)[i][j]; -// current_index++; -// } -// } -// } -// -// -// -// ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, GGML_MPI_POS); -// ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, GGML_MPI_SEQ_IDS); -// -// current_index = 0; -// for (int32_t i = 0; i < *n_tokens; i++) { -// for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { -// (*seq_id)[i][j] = flattened_seq_ids[current_index]; -// current_index++; -// } -// -// } -// free(flattened_seq_ids); + int32_t total_n_seq_ids = 0; + for (int32_t i = 0; i < *n_tokens; i++) { + total_n_seq_ids += (*n_seq_ids)[i]; + } + + // MPI can't chase the pointers for multidimensional arrays, so we flatten them first + // for transit + int32_t * flattened_seq_ids = static_cast(calloc(total_n_seq_ids, sizeof(int32_t))); + + int32_t current_index = 0; + + // Only rank 0 needs to flatten since the others don't have the real seq_id + if (ctx_mpi->rank == 0) { + for (int32_t i = 0; i < *n_tokens; i++) { + for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) { + flattened_seq_ids[current_index] = (*seq_id)[i][j]; + current_index++; + } + } + } + + + + ggml_mpi_sync_pipelined(ctx_mpi, *pos, *n_tokens, MPI_INT32_T, GGML_MPI_POS); + ggml_mpi_sync_pipelined(ctx_mpi, flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, GGML_MPI_SEQ_IDS); + + current_index = 0; + for (int32_t i = 0; i < *n_tokens; i++) { + for 
(int32_t j = 0; j < (*n_seq_ids)[i]; j++) { + (*seq_id)[i][j] = flattened_seq_ids[current_index]; + current_index++; + } + + } + free(flattened_seq_ids); } @@ -236,6 +280,19 @@ void ggml_mpi_sync_int( MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm); } +void ggml_mpi_sync_ints_pipelined( + struct ggml_mpi_context * ctx_mpi, + int32_t * vals, + int count, + int tag +) { + ggml_mpi_sync_pipelined(ctx_mpi, vals, count, MPI_INT32_T, tag); + int old_trans = ctx_mpi->trans_id; + ggml_mpi_sync_pipelined(ctx_mpi, &ctx_mpi->trans_id, 1, MPI_INT32_T, GGML_MPI_TRANS_ID); + ctx_mpi->recv_trans_id = ctx_mpi->trans_id; + ctx_mpi->trans_id = old_trans; +} + static void ggml_mpi_tensor_send(const struct ggml_tensor * t, const void* data, int mpi_rank_dst, MPI_Comm comm) { MPI_Datatype mpi_type; @@ -549,6 +606,8 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t } } + // TODO exploding memory usage cause we replace the buffer with the wrapped buffer, + // but don't free the contexts, and then create new ones when we re-wrap if (!ctx->remote) { diff --git a/ggml-mpi.h b/ggml-mpi.h index fe8358f2d..e7880b704 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -100,6 +100,26 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer_type( GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_t buf); +void ggml_mpi_sync_ints_pipelined( + struct ggml_mpi_context * ctx_mpi, + int32_t * vals, + int count, + int tag +); + +void ggml_mpi_sync_ints_pipelined_back( + struct ggml_mpi_context * ctx_mpi, + int32_t * vals, + int count, + int tag +); +// clear = 1, rm = 2, cp = 3, keep = 4, seq_shift = 5 +void ggml_mpi_probe(struct ggml_mpi_context * ctx_mpi, int src, int tag); +int ggml_mpi_status_tag(struct ggml_mpi_context * ctx_mpi); + +int ggml_mpi_iprobe(struct ggml_mpi_context * ctx_mpi, int src, int tag); +int ggml_mpi_status_count_int32(struct ggml_mpi_context * ctx_mpi); + /** * Create a new context by splitting the given context's * communicator, creating a "sub-communicator." 
This is a collective diff --git a/llama.cpp b/llama.cpp index d20c17677..5d7d10318 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9064,11 +9064,58 @@ static void llama_graph_compute( // static int llama_decode_internal( llama_context & lctx, - llama_batch batch_all) { // TODO: rename back to batch + llama_batch & batch_all) { // TODO: rename back to batch +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0 && ggml_mpi_size(lctx.model.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_DECODE; + ggml_mpi_sync_ints_pipelined(lctx.model.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } +// ggml_mpi_sync_ints_pipelined(lctx.model.ctx_mpi, &batch_all.batch_id, 1, GGML_MPI_BATCH_ID); + int old_tokens = batch_all.n_tokens; + + ggml_mpi_sync_ints_pipelined(lctx.model.ctx_mpi, &batch_all.n_tokens, 1, GGML_MPI_N_TOKENS); + + ggml_mpi_sync_ints_pipelined(lctx.model.ctx_mpi, reinterpret_cast(&lctx.cparams.n_seq_max), 1, GGML_MPI_MAX_N_SEQ); + if (ggml_mpi_rank(lctx.model.ctx_mpi) > 0) { + int new_n_tokens = batch_all.n_tokens; + llama_batch_free(batch_all); + batch_all = llama_batch_init(new_n_tokens, 0, (int32_t)lctx.cparams.n_seq_max); + } +#endif + uint32_t n_tokens_all = batch_all.n_tokens; + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_arr; + std::vector> seq_id; + + if (batch_all.pos == nullptr) { + pos.resize(n_tokens_all); + for (uint32_t i = 0; i < n_tokens_all; i++) { + pos[i] = batch_all.all_pos_0 + i*batch_all.all_pos_1; + } + + batch_all.pos = pos.data(); + } + + if (batch_all.seq_id == nullptr) { + n_seq_id.resize(n_tokens_all); + seq_id.resize(n_tokens_all); + seq_id_arr.resize(n_tokens_all); + for (uint32_t i = 0; i < n_tokens_all; i++) { + n_seq_id[i] = 1; + seq_id[i].resize(lctx.cparams.n_seq_max); + seq_id[i][0] = batch_all.all_seq_id; + seq_id_arr[i] = seq_id[i].data(); + } + + batch_all.n_seq_id = n_seq_id.data(); + batch_all.seq_id = seq_id_arr.data(); + } + #ifdef GGML_USE_MPI ggml_mpi_eval_init(lctx.model.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max); n_tokens_all = batch_all.n_tokens; @@ -9114,10 +9161,6 @@ static int llama_decode_internal( const auto n_ubatch = cparams.n_ubatch; - std::vector pos; - std::vector n_seq_id; - std::vector seq_id_arr; - std::vector> seq_id; for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) { uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); @@ -14344,6 +14387,87 @@ void llama_batch_free(struct llama_batch batch) { batch.logits = nullptr; } +#ifdef GGML_USE_MPI + +int llama_process_mpi_transaction( + struct llama_context * ctx, + struct llama_batch & batch, + int tag) { +// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1) { +// printf("\nBeginning transaction type %d\n", tag); +// } + + switch (tag) { + case GGML_MPI_DECODE: +// llama_batch_free(batch); + return llama_decode_internal(*ctx, batch); + break; + case GGML_MPI_KV_CLEAR: + llama_kv_cache_clear(ctx); + break; + case GGML_MPI_KV_SEQ_RM: + llama_kv_cache_seq_rm(ctx, 1, -1, -1); + break; + case GGML_MPI_KV_SEQ_CP: + llama_kv_cache_seq_cp(ctx, 0, 0, 0, 0); + break; +// case GGML_MPI_KV_SEQ_CP_BACK: +// llama_kv_cache_seq_cp_back(ctx, 0, 0, 0, 0); +// break; +// case GGML_MPI_KV_SEQ_KEEP: +// llama_kv_cache_seq_keep(ctx, 0); +// break; +// case GGML_MPI_KV_SEQ_SHIFT: +// llama_kv_cache_seq_shift(ctx, 0, 0, 0, 0); +// break; + default: + printf("Unknown operation, exiting\n"); + exit(1); + break; + } + 
return 0; +} + +int llama_process_mpi_worker( + struct llama_context * ctx, + struct llama_batch & batch) { + ggml_mpi_probe(ctx->model.ctx_mpi, -1, -1); + int tag = ggml_mpi_status_tag(ctx->model.ctx_mpi); + int32_t count; + int32_t trans_type; +// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1) { +// printf("\nReceived command %d\n", tag); +// } + switch (tag) { + case GGML_MPI_BEGIN_TRANSACTION: + + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, &trans_type, 1, GGML_MPI_BEGIN_TRANSACTION); + return llama_process_mpi_transaction(ctx, batch, trans_type); + break; + case GGML_MPI_SHUTDOWN: + llama_free(ctx); + llama_backend_free(); + exit(0); + break; + case GGML_MPI_CANCEL_RUN: +// count = ggml_mpi_status_count_int32(ctx->model.ctx_mpi); +//// printf("Received cancel run\n"); +// { +// std::vector canceled(count, -1); +// llama_cancel_run(ctx, canceled.data(), canceled.size()); +// +// } +// break; + default: + printf("Unknown operation, exiting\n"); + exit(1); + break; + } + return 0; +} + +#endif + int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch) { @@ -14351,9 +14475,7 @@ int32_t llama_decode( #ifdef GGML_USE_MPI if (ggml_mpi_rank(ctx->model.ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const int n_ctx = llama_n_ctx(ctx); - std::vector tmp(n_ctx, llama_token_bos(&ctx->model)); - while (llama_decode_internal(*ctx, batch) >= 0){}; + while (llama_process_mpi_worker(ctx, batch) >= 0){}; llama_backend_free(); exit(1); } From 9419190533f188be8a165519ee9a622d858381a9 Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 19 Mar 2024 15:01:39 -0500 Subject: [PATCH 35/35] Pipeline KV operations --- ggml-mpi.cpp | 2 +- ggml-mpi.h | 4 +- llama.cpp | 102 +++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 98 insertions(+), 10 deletions(-) diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp index 9fa3e6717..f8c87f2d6 100644 --- a/ggml-mpi.cpp +++ b/ggml-mpi.cpp @@ -126,7 +126,7 @@ void ggml_mpi_sync_pipelined( } if(ctx_mpi->rank < ctx_mpi->size - 1) { GGML_ASSERT(ctx_mpi->send_buffer != nullptr); - GGML_ASSERT(val != nullptr); + GGML_ASSERT(val != nullptr || count == 0); GGML_ASSERT(count < 128*1024*1024); const int retval = MPI_Bsend(val, count, datatype, ggml_mpi_next_node(ctx_mpi), tag, ctx_mpi->comm); diff --git a/ggml-mpi.h b/ggml-mpi.h index e7880b704..d988a81e4 100644 --- a/ggml-mpi.h +++ b/ggml-mpi.h @@ -22,7 +22,7 @@ extern "C" { #define GGML_MPI_KV_SEQ_KEEP 4 -#define GGML_MPI_KV_SEQ_SHIFT 5 +#define GGML_MPI_KV_SEQ_ADD 5 #define GGML_MPI_SHUTDOWN 6 @@ -54,6 +54,8 @@ extern "C" { #define GGML_MPI_BATCH_LOGITS 20 +#define GGML_MPI_KV_SEQ_DIV 21 + /** diff --git a/llama.cpp b/llama.cpp index 5d7d10318..a6392237b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13845,14 +13845,56 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) { } void llama_kv_cache_clear(struct llama_context * ctx) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->model.ctx_mpi) == 0 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_CLEAR; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, nullptr, 0, GGML_MPI_KV_CLEAR); +#endif llama_kv_cache_clear(ctx->kv_self); } bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->model.ctx_mpi) == 0 && 
ggml_mpi_size(ctx->model.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_SEQ_RM; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + int32_t vals[3] = {seq_id, p0, p1}; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, vals, 3, GGML_MPI_KV_SEQ_RM); + seq_id = vals[0]; + p0 = vals[1]; + p1 = vals[2]; +// if (ggml_mpi_rank(ctx->model.ctx_mpi) == ggml_mpi_size(ctx->model.ctx_mpi) - 1 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { +// printf("\nRemoving sequence %d from %d to %d\n", seq_id, p0, p1); +// } +#endif return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->model.ctx_mpi) == 0 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_SEQ_CP; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + + int32_t vals[4] = {seq_id_src, seq_id_dst, p0, p1}; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_CP); +// if(ggml_mpi_recv_trans_id(ctx->model.ctx_mpi) < ggml_mpi_trans_id(ctx->model.ctx_mpi)) { +//// return; +// } +// ggml_mpi_inc_trans_id(ctx->model.ctx_mpi); + seq_id_src = vals[0]; + seq_id_dst = vals[1]; + p0 = vals[2]; + p1 = vals[3]; +// if (ggml_mpi_rank(ctx->model.ctx_mpi) == ggml_mpi_size(ctx->model.ctx_mpi) - 1 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { +// printf("\nCopying sequence %d to sequence %d from %d to %d\n", seq_id_src, seq_id_dst, p0, p1); +// } +#endif + if (seq_id_src == seq_id_dst) { return; } @@ -13860,10 +13902,35 @@ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, } void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->model.ctx_mpi) == 0 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_SEQ_KEEP; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + int32_t vals[1] = {seq_id}; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, vals, 1, GGML_MPI_KV_SEQ_KEEP); + seq_id = vals[0]; +#endif llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->model.ctx_mpi) == 0 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_SEQ_ADD; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + int32_t vals[4] = {seq_id, p0, p1, delta}; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_ADD); + seq_id = vals[0]; + p0 = vals[1]; + p1 = vals[2]; + delta = vals[3]; +// if (ggml_mpi_rank(ctx->model.ctx_mpi) == ggml_mpi_size(ctx->model.ctx_mpi) - 1 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { +// printf("\nRemoving sequence %d from %d to %d\n", seq_id, p0, p1); +// } +#endif + if (delta == 0) { return; } @@ -13872,6 +13939,22 @@ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, lla } void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { +#ifdef GGML_USE_MPI + if (ggml_mpi_rank(ctx->model.ctx_mpi) == 0 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { + int transaction_type = GGML_MPI_KV_SEQ_DIV; + 
ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, &transaction_type, 1, GGML_MPI_BEGIN_TRANSACTION); + } + int32_t vals[4] = {seq_id, p0, p1, d}; + ggml_mpi_sync_ints_pipelined(ctx->model.ctx_mpi, vals, 4, GGML_MPI_KV_SEQ_DIV); + seq_id = vals[0]; + p0 = vals[1]; + p1 = vals[2]; + d = vals[3]; +// if (ggml_mpi_rank(ctx->model.ctx_mpi) == ggml_mpi_size(ctx->model.ctx_mpi) - 1 && ggml_mpi_size(ctx->model.ctx_mpi) > 1) { +// printf("\nRemoving sequence %d from %d to %d\n", seq_id, p0, p1); +// } +#endif + if (d == 1) { return; } @@ -14393,7 +14476,7 @@ int llama_process_mpi_transaction( struct llama_context * ctx, struct llama_batch & batch, int tag) { -// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1) { +// if (ggml_mpi_rank(ctx->model.ctx_mpi) == ggml_mpi_size(ctx->model.ctx_mpi) - 1) { // printf("\nBeginning transaction type %d\n", tag); // } @@ -14414,12 +14497,15 @@ int llama_process_mpi_transaction( // case GGML_MPI_KV_SEQ_CP_BACK: // llama_kv_cache_seq_cp_back(ctx, 0, 0, 0, 0); // break; -// case GGML_MPI_KV_SEQ_KEEP: -// llama_kv_cache_seq_keep(ctx, 0); -// break; -// case GGML_MPI_KV_SEQ_SHIFT: -// llama_kv_cache_seq_shift(ctx, 0, 0, 0, 0); -// break; + case GGML_MPI_KV_SEQ_KEEP: + llama_kv_cache_seq_keep(ctx, 0); + break; + case GGML_MPI_KV_SEQ_ADD: + llama_kv_cache_seq_add(ctx, 0, 0, 0, 0); + break; + case GGML_MPI_KV_SEQ_DIV: + llama_kv_cache_seq_div(ctx, 0, 0, 0, 0); + break; default: printf("Unknown operation, exiting\n"); exit(1); @@ -14435,7 +14521,7 @@ int llama_process_mpi_worker( int tag = ggml_mpi_status_tag(ctx->model.ctx_mpi); int32_t count; int32_t trans_type; -// if (ggml_mpi_rank(ctx->ctx_mpi) == ggml_mpi_size(ctx->ctx_mpi) - 1) { +// if (ggml_mpi_rank(ctx->model.ctx_mpi) == ggml_mpi_size(ctx->model.ctx_mpi) - 1) { // printf("\nReceived command %d\n", tag); // } switch (tag) {