diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c3a82b7a..e766833c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -345,10 +345,16 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) if (LLAMA_STATIC) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + if (WIN32) + # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() else() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # 52 == lowest CUDA 12 standard diff --git a/common/common.cpp b/common/common.cpp index a8abf04eb..1bd2bc1ec 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -603,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CLBLAS_SYCL +#ifndef GGML_USE_CUBLAS_SYCL fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CLBLAS_SYCL +#endif // GGML_USE_CUBLAS_SYCL } else if (arg == "--split-mode" || arg == "-sm") { if (++i >= argc) { invalid_param = true; @@ -622,9 +622,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifndef GGML_USE_CLBLAS_SYCL +#ifndef GGML_USE_CUBLAS_SYCL fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CLBLAS_SYCL +#endif // GGML_USE_CUBLAS_SYCL } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { @@ -651,7 +651,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } #else fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CLBLAS_SYCL +#endif // GGML_USE_CUBLAS_SYCL } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) params.mul_mat_q = false; @@ -1020,16 +1020,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); #endif // LLAMA_SUPPORTS_GPU_OFFLOAD -#ifdef GGML_USE_CLBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_CLBLAS -#ifdef GGML_USE_SYCL - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_SYCL_NAME " instead of custom mul_mat_q " GGML_SYCL_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_SYCL printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" -gan N, --grp-attn-n N\n");