use "ROCm" instead of "CUDA"

Henri Vasserman 2023-08-22 19:24:35 +03:00
parent 391dd9a0e2
commit 5d3e7b25e0
GPG key ID: 2995FC0F58B1A986
5 changed files with 13 additions and 3 deletions

common/common.cpp

@@ -601,7 +601,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
     fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#if defined(GGML_USE_HIPBLAS)
+    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q HIP kernels instead of hipBLAS. TEMP!!!\n");
+#else
     fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
+#endif
     fprintf(stdout, "  Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n");
     fprintf(stdout, "  is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n");
 #endif
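
(Aside: a minimal standalone sketch of the conditional-compilation pattern in this hunk; the file name and build commands below are illustrative, not part of the commit.)

// backend_help.cpp -- hypothetical standalone demo, not part of this commit.
// CUDA wording:  g++ backend_help.cpp -o help && ./help
// HIP wording:   g++ -DGGML_USE_HIPBLAS backend_help.cpp -o help && ./help
#include <cstdio>

int main() {
#if defined(GGML_USE_HIPBLAS)
    // Selected at compile time when the HIP/ROCm build flag is defined.
    fprintf(stdout, "  -mmq, --mul-mat-q  use experimental mul_mat_q HIP kernels instead of hipBLAS. TEMP!!!\n");
#else
    fprintf(stdout, "  -mmq, --mul-mat-q  use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
#endif
    return 0;
}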

examples/llama-bench/llama-bench.cpp

@@ -504,7 +504,7 @@ struct test {
     static std::string get_backend() {
         if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
         }
         if (opencl) {
             return "OpenCL";

ggml-cuda.cu

@@ -5025,7 +5025,7 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
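
(Aside: the edited format string works because GGML_CUDA_NAME expands to a string literal, and C/C++ concatenate adjacent string literals at compile time, so no extra %s argument is needed. A minimal sketch with a stand-in definition:)

// concat_demo.cpp -- hypothetical demo, not part of this commit.
#include <cstdio>

#define GGML_CUDA_NAME "CUDA"  // stand-in for the real definition in ggml-cuda.h

int main() {
    // Preprocessing yields "%s: found %d " "CUDA" " devices:\n"; the compiler
    // then merges the adjacent literals into one format string.
    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, 2);
    return 0;
}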

ggml-cuda.h

@@ -2,6 +2,12 @@
 #include "ggml.h"
 
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
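
(Aside: this header is the single point where the backend name is chosen; every call site that embeds GGML_CUDA_NAME follows. A minimal end-to-end sketch, assuming a hypothetical standalone file; build lines are illustrative:)

// name_switch.cpp -- hypothetical demo of the GGML_CUDA_NAME switch.
// g++ name_switch.cpp -o demo && ./demo                     -> using CUDA ...
// g++ -DGGML_USE_HIPBLAS name_switch.cpp -o demo && ./demo  -> using ROCm ...
#include <cstdio>

#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#else
#define GGML_CUDA_NAME "CUDA"
#endif

int main() {
    // The backend name is baked in at compile time from the one definition above.
    printf("using " GGML_CUDA_NAME " for GPU acceleration\n");
    return 0;
}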

llama.cpp

@@ -1478,7 +1478,7 @@ static void llama_model_load_internal(
     (void) main_gpu;
     (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
     ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU