use "ROCm" instead of "CUDA"
parent 391dd9a0e2
commit 5d3e7b25e0
5 changed files with 13 additions and 3 deletions
common.cpp
@@ -601,7 +601,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
     fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#if defined(GGML_USE_HIPBLAS)
+    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q HIP kernels instead of hipBLAS. TEMP!!!\n");
+#else
     fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
+#endif
     fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n");
     fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n");
 #endif
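The hunk above is plain preprocessor branching: the HIP wording is compiled in only when GGML_USE_HIPBLAS is defined. A minimal standalone sketch of the same pattern (not part of the commit; the file name and trimmed help text are illustrative):

    // usage_sketch.cpp -- illustrative only; build with -DGGML_USE_HIPBLAS
    // (e.g. `g++ -DGGML_USE_HIPBLAS usage_sketch.cpp`) to get the HIP wording.
    #include <cstdio>

    static void print_mmq_usage() {
    #if defined(GGML_USE_HIPBLAS)
        // HIP build: the help text names HIP kernels and hipBLAS
        fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q HIP kernels instead of hipBLAS\n");
    #else
        // default build: the help text names CUDA kernels and cuBLAS
        fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS\n");
    #endif
    }

    int main() {
        print_mmq_usage();
        return 0;
    }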
llama-bench.cpp
@@ -504,7 +504,7 @@ struct test {
     static std::string get_backend() {
         if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
         }
         if (opencl) {
             return "OpenCL";
         }
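With this change, llama-bench's backend column should read "ROCm" on hipBLAS builds and "CUDA" otherwise, via the GGML_CUDA_NAME define added to ggml-cuda.h below.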
ggml-cuda.cu
@@ -5025,7 +5025,7 @@ void ggml_init_cublas() {
     CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
-    fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
ggml-cuda.h
@@ -2,6 +2,12 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
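The define works in format strings because C and C++ concatenate adjacent string literals at translation time, so "found %d " GGML_CUDA_NAME " devices" collapses into a single literal. A small self-contained sketch of that mechanism (illustrative only; the device count is a placeholder):

    // name_sketch.cpp -- illustrative only; mirrors the GGML_CUDA_NAME usage above.
    #include <cstdio>

    #ifdef GGML_USE_HIPBLAS
    #define GGML_CUDA_NAME "ROCm"
    #else
    #define GGML_CUDA_NAME "CUDA"
    #endif

    int main() {
        int device_count = 1; // placeholder value for the sketch
        // Adjacent literals merge: "%s: found %d " "CUDA" " devices:\n"
        // becomes "%s: found %d CUDA devices:\n" before compilation proper.
        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, device_count);
        return 0;
    }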
llama.cpp
@@ -1478,7 +1478,7 @@ static void llama_model_load_internal(
     (void) main_gpu;
     (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
     ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
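Taken together, a hipBLAS build should log something like

    llama_model_load_internal: using ROCm for GPU acceleration

while a cuBLAS build keeps the familiar "using CUDA" line; only the compile-time name changes, not the code path.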