From 48fbf8ca1acfdcb73c00e996bfff66e548aec3aa Mon Sep 17 00:00:00 2001
From: Yui
Date: Mon, 8 Apr 2024 10:08:14 +0200
Subject: [PATCH] add multi-gpu support for ngl calculations

---
 llama.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0ca439dcd..6f9b5d534 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4362,13 +4362,23 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu) {
+static int llm_determine_max_ngl(const llama_model_loader & ml, const llama_model & model, const int main_gpu, enum llama_split_mode split_mode) {
     const auto & hparams = model.hparams;
 
     // could become negative - use signed size_t
-    ssize_t available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    ssize_t available_gpu_memory = 0;
     int n_layer = hparams.n_layer;
 
+    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
+        // veeery sketchy, there has to be a better way to do this
+        int device_count = llama_get_device_count();
+        for (int i = 0; i < device_count; ++i) {
+            available_gpu_memory += llama_get_available_device_memory(i);
+        }
+    } else {
+        available_gpu_memory = llama_get_available_device_memory(main_gpu);
+    }
+
     // "avoid a scenario where an application ooms because llama.cpp only left 5 MB of VRAM" - https://github.com/ggerganov/llama.cpp/pull/6502#discussion_r1555060962
     available_gpu_memory -= 50 * MiB;
 
@@ -4452,7 +4462,7 @@ static bool llm_load_tensors(
     auto & hparams = model.hparams;
 
     if (n_gpu_layers == -2) {
-        n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu);
+        n_gpu_layers = llm_determine_max_ngl(ml, model, main_gpu, split_mode);
         LLAMA_LOG_INFO("%s: automatically set n_gpu_layers to %d\n", __func__, n_gpu_layers);
     }
 
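
For readers who want to see the pooling logic of the patch in isolation, below is a minimal standalone C++ sketch of what llm_determine_max_ngl() does after this change: pool free VRAM across all devices when layers are split across GPUs (LLAMA_SPLIT_MODE_LAYER), otherwise use only the main GPU, subtract the 50 MiB safety margin, and cap the result at the model's layer count. The helpers get_device_count() and get_free_device_memory(), the stubbed VRAM figures, and the bytes_per_layer parameter are illustrative assumptions, not llama.cpp API; the real function derives the per-layer footprint from the model instead of taking it as an argument.

    // Minimal, self-contained sketch under the assumptions stated above.
    // get_device_count() / get_free_device_memory() are hypothetical stand-ins for
    // llama_get_device_count() / llama_get_available_device_memory().
    #include <cstdint>
    #include <cstdio>

    enum split_mode_t { SPLIT_MODE_NONE, SPLIT_MODE_LAYER };

    static const int64_t MiB = 1024 * 1024;

    // hypothetical stubs: pretend we have two GPUs with 8 GiB and 6 GiB free
    static int get_device_count() { return 2; }
    static int64_t get_free_device_memory(int device) {
        return (device == 0 ? 8192 : 6144) * MiB;
    }

    // estimate how many layers fit, mirroring the branch added in the patch
    static int estimate_max_ngl(split_mode_t split_mode, int main_gpu, int n_layer, int64_t bytes_per_layer) {
        int64_t available = 0;
        if (split_mode == SPLIT_MODE_LAYER) {
            // layers get spread across devices, so pool the free VRAM of all of them
            for (int i = 0; i < get_device_count(); ++i) {
                available += get_free_device_memory(i);
            }
        } else {
            // a single device has to hold every offloaded layer
            available = get_free_device_memory(main_gpu);
        }
        available -= 50 * MiB;  // same 50 MiB headroom as the patch
        if (available <= 0 || bytes_per_layer <= 0) {
            return 0;
        }
        int ngl = (int)(available / bytes_per_layer);
        return ngl < n_layer ? ngl : n_layer;  // never report more layers than the model has
    }

    int main() {
        // e.g. a 32-layer model at roughly 400 MiB per layer
        printf("layer split: ngl = %d\n", estimate_max_ngl(SPLIT_MODE_LAYER, 0, 32, 400 * MiB));
        printf("main gpu   : ngl = %d\n", estimate_max_ngl(SPLIT_MODE_NONE,  0, 32, 400 * MiB));
        return 0;
    }

With the stubbed numbers the layer-split path pools 14 GiB and clamps the estimate to the model's 32 layers, while the single-GPU path fits about 20 layers, which is the behavioural difference this patch introduces.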