From d41cef9326cddb895b3b44edd0287602b0de774a Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Mon, 8 Jan 2024 13:42:20 +0100
Subject: [PATCH] minor

---
 common/common.cpp          | 30 +++++++++++-------------------
 llama.h                    |  4 ++--
 tests/test-backend-ops.cpp | 14 +++++++-------
 3 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 47f9381cf..58155f7f5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -529,9 +529,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers = std::stoi(argv[i]);
-#else
+#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@@ -540,9 +539,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers_draft = std::stoi(argv[i]);
-#else
+#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@@ -551,11 +549,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
             params.main_gpu = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
-#endif
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--split-mode" || arg == "-sm") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -580,15 +577,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
             std::string arg_next = argv[i];
 
             // split string by , and /
             const std::regex regex{R"([,/]+)"};
             std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
             std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
-
+            if (split_arg.size() >= LLAMA_MAX_DEVICES) {
+                invalid_param = true;
+                break;
+            }
             for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
                 if (i < split_arg.size()) {
                     params.tensor_split[i] = std::stof(split_arg[i]);
@@ -596,14 +594,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                     params.tensor_split[i] = 0.0f;
                 }
             }
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
-#endif // GGML_USE_CUBLAS
-        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
-#ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = false;
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
diff --git a/llama.h b/llama.h
index 662c2c3be..605783b42 100644
--- a/llama.h
+++ b/llama.h
@@ -117,7 +117,7 @@ extern "C" {
 
     enum llama_split_mode {
         LLAMA_SPLIT_NONE    = 0, // single GPU
-        LLAMA_SPLIT_LAYER   = 1, // split layers and KV to different GPUs
+        LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
         LLAMA_SPLIT_ROW     = 2, // split rows across GPUs
     };
 
@@ -188,7 +188,7 @@ extern "C" {
         // for small tensors and intermediate results (LLAMA_SPLIT_ROW)
         // ignored for LLAMA_SPLIT_LAYER
         int32_t main_gpu;
-        // fraction of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 1f95258a8..18eb4a881 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -474,17 +474,17 @@ struct test_case {
             printf("compare failed ");
         }
 
-        if (ud.ok && cmp_ok) {
-            printf("\033[1;32mOK\033[0m\n");
-        } else {
-            printf("\033[1;31mFAIL\033[0m\n");
-        }
-
         ggml_backend_buffer_free(buf);
 
         ggml_free(ctx);
 
-        return ud.ok;
+        if (ud.ok && cmp_ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            return true;
+        }
+
+        printf("\033[1;31mFAIL\033[0m\n");
+        return false;
     }
 
     bool eval_perf(ggml_backend_t backend, const char * op_name) {