diff --git a/common/common.cpp b/common/common.cpp
index 1623ba21f..2c42b0cd4 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -601,7 +601,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
+#if defined(GGML_USE_HIPBLAS)
+    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q HIP kernels instead of hipBLAS. TEMP!!!\n" );
+#else
     fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
+#endif
     fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
     fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
 #endif
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 36057bfca..99edf8688 100755
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -504,7 +504,7 @@ struct test {
 
     static std::string get_backend() {
         if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
         }
         if (opencl) {
             return "OpenCL";
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index dd82011ae..7cd4a517b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5025,7 +5025,7 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
diff --git a/ggml-cuda.h b/ggml-cuda.h
index f66bb1678..17e8d471d 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -2,6 +2,12 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/llama.cpp b/llama.cpp
index 8b151dc84..42454d64b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1478,7 +1478,7 @@ static void llama_model_load_internal(
     (void) main_gpu;
     (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
     ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU