diff --git a/common/common.cpp b/common/common.cpp
index 1623ba21f..2c42b0cd4 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -601,7 +601,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
+#if defined(GGML_USE_HIPBLAS)
+    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q HIP kernels instead of hipBLAS. TEMP!!!\n" );
+#else
     fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
+#endif
     fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
     fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
 #endif
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 36057bfca..99edf8688 100755
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -504,7 +504,7 @@ struct test {
 
     static std::string get_backend() {
         if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
         }
         if (opencl) {
             return "OpenCL";
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index dd82011ae..7cd4a517b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5025,7 +5025,7 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
diff --git a/ggml-cuda.h b/ggml-cuda.h
index f66bb1678..17e8d471d 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -2,6 +2,12 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/llama.cpp b/llama.cpp
index 8b151dc84..42454d64b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1478,7 +1478,7 @@ static void llama_model_load_internal(
     (void) main_gpu;
     (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
     ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU