From 1e946c54a2047d1ebb0e33711f6e6a2970f43194 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 14 Dec 2023 19:41:53 +0100 Subject: [PATCH] cmake: enable UMA-compatible allocation when LLAMA_HIP_UMA=ON --- CMakeLists.txt | 4 ++++ README.md | 6 ++++-- ggml-cuda.cu | 7 ++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57b43c136..d5bd28812 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,6 +91,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) +option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) @@ -372,6 +373,9 @@ if (LLAMA_HIPBLAS) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + if (LLAMA_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) + endif() add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) if (BUILD_SHARED_LIBS) set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/README.md b/README.md index 696122d52..bf9583ea9 100644 --- a/README.md +++ b/README.md @@ -421,13 +421,15 @@ Building the program with BLAS support may lead to some performance improvements ```bash make LLAMA_HIPBLAS=1 ``` - - Using `CMake` for Linux. + - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build -- -j 16 ``` - - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS): + On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`. + However, this hurts performance for non-integrated GPUs. + - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% mkdir build diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 88fbe7917..310fb08d7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -58,8 +58,13 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError -#define cudaMalloc(ptr, size) hipMallocManaged(ptr, size) +#ifdef GGML_HIP_UMA +#define cudaMalloc hipMallocManaged #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) +#else +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#endif #define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync