diff --git a/common/common.cpp b/common/common.cpp
index eacaee18e..47f9381cf 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -556,6 +556,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
 #endif
+        } else if (arg == "--split-mode" || arg == "-sm") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string arg_next = argv[i];
+            if (arg_next == "none") {
+                params.split_mode = LLAMA_SPLIT_NONE;
+            } else if (arg_next == "layer") {
+                params.split_mode = LLAMA_SPLIT_LAYER;
+            } else if (arg_next == "row") {
+                params.split_mode = LLAMA_SPLIT_ROW;
+            } else {
+                invalid_param = true;
+                break;
+            }
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -895,14 +914,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        number of layers to store in VRAM\n");
     printf("  -ngld N, --n-gpu-layers-draft N\n");
     printf("                        number of layers to store in VRAM for the draft model\n");
+    printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+    printf("                        how to split the model across multiple GPUs, one of:\n");
+    printf("                          - none: use one GPU only\n");
+    printf("                          - layer (default): split layers and KV across GPUs\n");
+    printf("                          - row: split rows across GPUs\n");
     printf("  -ts SPLIT --tensor-split SPLIT\n");
-    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-#ifdef GGML_USE_CUBLAS
-    printf("  -nommq, --no-mul-mat-q\n");
-    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
-    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
-#endif // GGML_USE_CUBLAS
+    printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+    printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
+    printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
 #endif
     printf("  --verbose-prompt      print prompt before generation\n");
     printf("  -dkvc, --dump-kv-cache\n");
@@ -1015,6 +1035,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
     mparams.main_gpu        = params.main_gpu;
+    mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
diff --git a/common/common.h b/common/common.h
index 9659aa045..758722c11 100644
--- a/common/common.h
+++ b/common/common.h
@@ -59,6 +59,7 @@ struct gpt_params {
     float   p_split                         = 0.1f;  // speculative decoding split probability
     int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
+    llama_split_mode split_mode             = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
     int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
     int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
diff --git a/ggml-backend.c b/ggml-backend.c
index c03221200..cddd784b9 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1394,6 +1394,8 @@ ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched,
 }
 
 void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+    // FIXME: node_allocr is cleared when splitting the graph, so all user assignments are lost
+    // to avoid this, we need to clear node_allocr after compute rather than before split
     int backend_index = sched_backend_prio(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     node_allocr(node) = sched->tallocs[backend_index];
diff --git a/llama.cpp b/llama.cpp
index d615252fd..82db0a11d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3130,6 +3130,7 @@ static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
+        enum llama_split_mode split_mode,
         int main_gpu,
         const float * tensor_split,
         bool use_mlock,
@@ -3144,14 +3145,6 @@ static bool llm_load_tensors(
 
     size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
 
-    // TODO: user configurable
-    enum gpu_split_mode {
-        LLAMA_SPLIT_NONE,    // single GPU
-        LLAMA_SPLIT_LAYER,   // offload layers to different GPUs
-        LLAMA_SPLIT_ROW      // split matrix rows across GPUs
-    };
-
-    gpu_split_mode split_mode = LLAMA_SPLIT_LAYER;
     const int64_t n_layer     = hparams.n_layer;
     const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
 
@@ -3207,13 +3200,25 @@ static bool llm_load_tensors(
     } else
 #endif
     {
-        // offload layers
+        ggml_backend_buffer_type_t split_buft;
+        if (split_mode == LLAMA_SPLIT_ROW) {
+            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+        } else {
+            split_buft = llama_default_buffer_type_offload(main_gpu);
+        }
+        // repeating layers
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
-            model.buft_layer[i] = { llama_default_buffer_type_split(main_gpu, tensor_split), llama_default_buffer_type_offload(main_gpu) };
+            model.buft_layer[i] = {
+                split_buft,
+                llama_default_buffer_type_offload(main_gpu)
+            };
         }
         // output layer
         if (n_gpu_layers > n_layer) {
-            model.buft_output = { llama_default_buffer_type_split(main_gpu, tensor_split), llama_default_buffer_type_offload(main_gpu) };
+            model.buft_output = {
+                split_buft,
+                llama_default_buffer_type_offload(main_gpu)
+            };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
@@ -3804,7 +3809,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
         }
 
         if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
+            ml, model, params.n_gpu_layers, params.split_mode,  params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
@@ -8964,6 +8969,7 @@ static int llama_apply_lora_from_file_internal(
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
+        /*.split_mode                  =*/ LLAMA_SPLIT_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
diff --git a/llama.h b/llama.h
index 461d4604a..662c2c3be 100644
--- a/llama.h
+++ b/llama.h
@@ -115,6 +115,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE    = 0, // single GPU
+        LLAMA_SPLIT_LAYER   = 1, // split layers and KV to different GPUs
+        LLAMA_SPLIT_ROW     = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -177,8 +183,13 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+        // the GPU that is used for the model (LLAMA_SPLIT_NONE),
+        // for small tensors and intermediate results (LLAMA_SPLIT_ROW)
+        // ignored for LLAMA_SPLIT_LAYER
+        int32_t main_gpu;
+        // fraction of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.