revert unrelated changes in CUDA cmake
remove useless nommq; fix typo of GGML_USE_CLBLAS_SYCL (now GGML_USE_CUBLAS_SYCL)
This commit is contained in:
parent
67de350e79
commit
fb15de38ef
2 changed files with 12 additions and 16 deletions
|
@ -345,10 +345,16 @@ if (LLAMA_CUBLAS)
|
||||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
||||||
|
|
||||||
if (LLAMA_STATIC)
|
if (LLAMA_STATIC)
|
||||||
|
if (WIN32)
|
||||||
|
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
||||||
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||||
|
else ()
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||||
|
endif()
|
||||||
else()
|
else()
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||||
endif()
|
endif()
|
||||||
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
|
||||||
|
|
||||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||||
# 52 == lowest CUDA 12 standard
|
# 52 == lowest CUDA 12 standard
|
||||||
|
|
|
@ -603,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.main_gpu = std::stoi(argv[i]);
|
params.main_gpu = std::stoi(argv[i]);
|
||||||
#ifndef GGML_USE_CLBLAS_SYCL
|
#ifndef GGML_USE_CUBLAS_SYCL
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
||||||
#endif // GGML_USE_CLBLAS_SYCL
|
#endif // GGML_USE_CUBLAS_SYCL
|
||||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -622,9 +622,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifndef GGML_USE_CLBLAS_SYCL
|
#ifndef GGML_USE_CUBLAS_SYCL
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
||||||
#endif // GGML_USE_CLBLAS_SYCL
|
#endif // GGML_USE_CUBLAS_SYCL
|
||||||
|
|
||||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
@ -651,7 +651,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
|
||||||
#endif // GGML_USE_CLBLAS_SYCL
|
#endif // GGML_USE_CUBLAS_SYCL
|
||||||
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
|
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||||
params.mul_mat_q = false;
|
params.mul_mat_q = false;
|
||||||
|
@ -1020,16 +1020,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||||
#ifdef GGML_USE_CLBLAS
|
|
||||||
printf(" -nommq, --no-mul-mat-q\n");
|
|
||||||
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
|
||||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
|
||||||
#endif // GGML_USE_CLBLAS
|
|
||||||
#ifdef GGML_USE_SYCL
|
|
||||||
printf(" -nommq, --no-mul-mat-q\n");
|
|
||||||
printf(" use " GGML_SYCL_NAME " instead of custom mul_mat_q " GGML_SYCL_NAME " kernels.\n");
|
|
||||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
|
||||||
#endif // GGML_USE_SYCL
|
|
||||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||||
printf(" -gan N, --grp-attn-n N\n");
|
printf(" -gan N, --grp-attn-n N\n");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue