minor

2024-01-08 13:42:20 +01:00 · 2024-01-08 13:42:20 +01:00 · d41cef9326
commit d41cef9326
parent 444b975edd
3 changed files with 20 additions and 28 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -529,9 +529,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
            params.n_gpu_layers = std::stoi(argv[i]);
-#else
+#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@ -540,9 +539,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
            params.n_gpu_layers_draft = std::stoi(argv[i]);
-#else
+#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@ -551,11 +549,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-#ifdef GGML_USE_CUBLAS
            params.main_gpu = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
-#endif
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS
        } else if (arg == "--split-mode" || arg == "-sm") {
            if (++i >= argc) {
                invalid_param = true;
@ -580,15 +577,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-#ifdef GGML_USE_CUBLAS
            std::string arg_next = argv[i];

            // split string by , and /
            const std::regex regex{R"([,/]+)"};
            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
            std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
-
+            if (split_arg.size() > LLAMA_MAX_DEVICES) { // exactly LLAMA_MAX_DEVICES values still fit (loop below indexes i < LLAMA_MAX_DEVICES), matching the former `<=` assert
+                invalid_param = true;
+                break;
+            }
            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
                if (i < split_arg.size()) {
                    params.tensor_split[i] = std::stof(split_arg[i]);
@ -596,14 +594,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                    params.tensor_split[i] = 0.0f;
                }
            }
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
-#endif // GGML_USE_CUBLAS
-        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
-#ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = false;
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
--- a/llama.h
+++ b/llama.h
@ -117,7 +117,7 @@ extern "C" {

    enum llama_split_mode {
        LLAMA_SPLIT_NONE    = 0, // single GPU
-        LLAMA_SPLIT_LAYER   = 1, // split layers and KV to different GPUs
+        LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
        LLAMA_SPLIT_ROW     = 2, // split rows across GPUs
    };

@ -188,7 +188,7 @@ extern "C" {
        // for small tensors and intermediate results (LLAMA_SPLIT_ROW)
        // ignored for LLAMA_SPLIT_LAYER
        int32_t main_gpu;
-        // fraction of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
        const float * tensor_split;

        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -474,17 +474,17 @@ struct test_case {
            printf("compare failed ");
        }

-        if (ud.ok && cmp_ok) {
-            printf("\033[1;32mOK\033[0m\n");
-        } else {
-            printf("\033[1;31mFAIL\033[0m\n");
-        }
-
        ggml_backend_buffer_free(buf);

        ggml_free(ctx);

-        return ud.ok;
+        if (ud.ok && cmp_ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            return true;
+        }
+
+        printf("\033[1;31mFAIL\033[0m\n");
+        return false;
    }

    bool eval_perf(ggml_backend_t backend, const char * op_name) {