From 132d25b8a62ea084447e0014a0112c1b371fb3f8 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sun, 5 Nov 2023 10:08:57 -0500 Subject: [PATCH 1/4] cuda : fix disabling device with --tensor-split 1,0 (#3951) Co-authored-by: slaren --- ggml-cuda.cu | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dc14f2f5d..9f873035a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat( int64_t row_low[GGML_CUDA_MAX_DEVICES]; int64_t row_high[GGML_CUDA_MAX_DEVICES]; + int used_devices = 0; + for (int64_t id = 0; id < g_device_count; ++id) { // by default, use all rows row_low[id] = 0; @@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat( continue; } + used_devices++; + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; @@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat( // if multiple devices are used they need to wait for the main device // here an event is recorded that signals that the main device has finished calculating the input data - if (split && g_device_count > 1) { + if (split && used_devices > 1) { CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); } - const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; @@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat( } for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } CUDA_CHECK(ggml_cuda_set_device(id)); // free buffers again when done @@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_set_device(g_main_device)); for (int64_t id = 0; id < g_device_count; ++id) { + if (row_low[id] == row_high[id]) { + continue; + } for (int64_t is = 0; is < is_max; ++is) { CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); } @@ -7400,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU) && + (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && (src1->backend == GGML_BACKEND_GPU) && ( dst->backend == GGML_BACKEND_GPU); From bb60fd0bf6bb270744d86dd45b3a95af01b7de45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A1i=20Ho=C3=A0ng=20T=C3=A2m?= <75922889+RoyalHeart@users.noreply.github.com> Date: Sun, 5 Nov 2023 23:15:27 +0700 Subject: [PATCH 2/4] server : fix typo for --alias shortcut from -m to -a (#3958) --- examples/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 715007735..089ebe2d1 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -7,7 +7,7 @@ Command line options: - `--threads N`, `-t N`: Set the number of threads to use during generation. 
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. +- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. From d9ccce2e339ca0396560d18b8637f2c848d72a08 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sun, 5 Nov 2023 10:06:06 -0700 Subject: [PATCH 3/4] Allow common process_escapes to handle \x sequences (#3928) * Allow common process_escapes to handle \x sequences * Fix edge case when second hex digit is NUL --- common/common.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 20cc4a081..37e3ace8a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -90,6 +90,19 @@ void process_escapes(std::string& input) { case '\'': input[output_idx++] = '\''; break; case '\"': input[output_idx++] = '\"'; break; case '\\': input[output_idx++] = '\\'; break; + case 'x': + // Handle \x12, etc + if (input_idx + 2 < input_len) { + const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 }; + char *err_p = nullptr; + const long val = std::strtol(x, &err_p, 16); + if (err_p == x + 2) { + input_idx += 2; + input[output_idx++] = char(val); + break; + } + // Intentionally fall through to default. + } default: input[output_idx++] = '\\'; input[output_idx++] = input[input_idx]; break; } From 2833a6f63c1b87c7f4ac574bcf7a15a2f3bf3ede Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 5 Nov 2023 18:45:16 +0100 Subject: [PATCH 4/4] ggml-cuda : fix f16 mul mat (#3961) * ggml-cuda : fix f16 mul mat ggml-ci * silence common.cpp warning (bonus) --- common/common.cpp | 2 +- ggml-cuda.cu | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 37e3ace8a..6a7114200 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -101,8 +101,8 @@ void process_escapes(std::string& input) { input[output_idx++] = char(val); break; } - // Intentionally fall through to default. 
} + // fall through default: input[output_idx++] = '\\'; input[output_idx++] = input[input_idx]; break; } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9f873035a..2d9ffffbf 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7414,6 +7414,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 (src1->backend == GGML_BACKEND_GPU) && ( dst->backend == GGML_BACKEND_GPU); + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + int64_t min_compute_capability = INT_MAX; for (int64_t id = 0; id < g_device_count; ++id) { if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { @@ -7435,13 +7437,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) {
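As an aside on patch 3 above, the following is a minimal standalone sketch, assuming only the C++ standard library and not taken verbatim from the patch, of the \x escape behaviour that process_escapes() in common/common.cpp gains: the two characters after "\x" are parsed as hex with strtol, and a byte is emitted only when both digits are consumed; otherwise (a non-hex digit, or the NUL edge case fixed in the follow-up commit) the text is left untouched. The function and variable names here are illustrative, not from the source tree.

// Standalone illustration of the "\x" hex-escape handling; names are hypothetical.
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <string>

static std::string unescape_hex(const std::string & in) {
    std::string out;
    for (std::size_t i = 0; i < in.size(); ++i) {
        if (in[i] == '\\' && i + 3 < in.size() && in[i + 1] == 'x') {
            const char hex[3] = { in[i + 2], in[i + 3], 0 };
            char * end = nullptr;
            const long val = std::strtol(hex, &end, 16);
            if (end == hex + 2) {            // both hex digits were valid
                out += static_cast<char>(val);
                i += 3;                      // skip 'x' and the two digits
                continue;
            }
        }
        out += in[i];                        // anything else passes through unchanged
    }
    return out;
}

int main() {
    // "\x41\x42" decodes to "AB"; "\xZ9" is not valid hex and stays as written.
    std::printf("%s\n", unescape_hex("hi \\x41\\x42 \\xZ9").c_str());
    return 0;
}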