Enable VMM on rocm

This commit is contained in:
uvos 2025-01-23 22:33:53 +01:00
parent 580b619a07
commit 894b489ada
3 changed files with 58 additions and 19 deletions

View file

@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
[[noreturn]] [[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails int id = -1; // in case cudaGetDevice fails
cudaGetDevice(&id); (void)cudaGetDevice(&id);
GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg); GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_CUDA_NO_VMM)
CUdevice device; CUdevice device;
CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id; alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
} }
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_CUDA_NO_VMM)
info.devices[id].vmm = !!device_vmm; info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop; cudaDeviceProp prop;
@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}; };
// pool with virtual memory // pool with virtual memory
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool { struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@ -309,6 +309,9 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
size_t pool_used = 0; size_t pool_used = 0;
size_t pool_size = 0; size_t pool_size = 0;
size_t granularity; size_t granularity;
#if defined(GGML_USE_HIP)
std::vector<std::pair<CUdeviceptr, size_t>> mappings;
#endif
explicit ggml_cuda_pool_vmm(int device) : explicit ggml_cuda_pool_vmm(int device) :
device(device), device(device),
@ -317,7 +320,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
~ggml_cuda_pool_vmm() { ~ggml_cuda_pool_vmm() {
if (pool_addr != 0) { if (pool_addr != 0) {
#if defined(GGML_USE_HIP)
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
}
#else
CU_CHECK(cuMemUnmap(pool_addr, pool_size)); CU_CHECK(cuMemUnmap(pool_addr, pool_size));
#endif
CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE)); CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
} }
} }
@ -350,7 +360,11 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
} }
// map at the end of the pool // map at the end of the pool
CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0)); CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
#if defined(GGML_USE_HIP)
mappings.push_back({start_ptr, reserve_size});
#endif
// the memory allocation handle is no longer needed after mapping // the memory allocation handle is no longer needed after mapping
CU_CHECK(cuMemRelease(handle)); CU_CHECK(cuMemRelease(handle));
@ -360,7 +374,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access.location.id = device; access.location.id = device;
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1)); CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
// add to the pool // add to the pool
pool_size += reserve_size; pool_size += reserve_size;
@ -372,7 +386,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(pool_addr != 0); GGML_ASSERT(pool_addr != 0);
void * ptr = (void *) (pool_addr + pool_used); void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
*actual_size = size; *actual_size = size;
pool_used += size; pool_used += size;
@ -391,17 +405,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
pool_used -= size; pool_used -= size;
// all deallocations must be in reverse order of the allocations // all deallocations must be in reverse order of the allocations
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
} }
}; };
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_CUDA_NO_VMM)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) { std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_CUDA_NO_VMM)
if (ggml_cuda_info().devices[device].vmm) { if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
} }
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) #endif // !defined(GGML_CUDA_NO_VMM)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
} }
@ -547,7 +561,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device); cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error
cudaGetLastError(); (void)cudaGetLastError();
GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err)); GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr; return nullptr;
} }
@ -962,7 +976,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
cudaError_t err = cudaMallocHost((void **) &ptr, size); cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error
cudaGetLastError(); (void)cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err)); size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr; return nullptr;
@ -1197,7 +1211,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
CUDA_CHECK(err); CUDA_CHECK(err);
} else { } else {
// reset the error // reset the error
cudaGetLastError(); (void)cudaGetLastError();
} }
} else { } else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other); cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
@ -1205,7 +1219,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
CUDA_CHECK(err); CUDA_CHECK(err);
} else { } else {
// reset the error // reset the error
cudaGetLastError(); (void)cudaGetLastError();
} }
} }
} }
@ -2438,7 +2452,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
if (stat == cudaErrorInvalidDeviceFunction) { if (stat == cudaErrorInvalidDeviceFunction) {
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node. // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
// We don't need to update blas nodes, so clear error and move on. // We don't need to update blas nodes, so clear error and move on.
cudaGetLastError(); (void)cudaGetLastError();
} else { } else {
GGML_ASSERT(stat == cudaSuccess); GGML_ASSERT(stat == cudaSuccess);
} }
@ -2506,7 +2520,7 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
// The pre-existing graph exec cannot be updated due to violated constraints // The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate // so instead clear error and re-instantiate
cudaGetLastError(); (void)cudaGetLastError();
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance)); CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
cuda_ctx->cuda_graph->instance = nullptr; cuda_ctx->cuda_graph->instance = nullptr;
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
@ -2734,7 +2748,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error
cudaGetLastError(); (void)cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err)); size / 1024.0 / 1024.0, cudaGetErrorString(err));
@ -2754,7 +2768,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
cudaError_t err = cudaHostUnregister(buffer); cudaError_t err = cudaHostUnregister(buffer);
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error
cudaGetLastError(); (void)cudaGetLastError();
} }
} }

View file

@ -19,6 +19,12 @@
#define CUBLAS_TF32_TENSOR_OP_MATH 0 #define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F #define CUDA_R_32F HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate #define cublasCreate hipblasCreate
@ -74,6 +80,21 @@
#define cudaMemGetInfo hipMemGetInfo #define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice #define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy #define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget #define cudaStreamFireAndForget hipStreamFireAndForget

View file

@ -96,6 +96,10 @@ if (GGML_HIP_GRAPHS)
add_compile_definitions(GGML_HIP_GRAPHS) add_compile_definitions(GGML_HIP_GRAPHS)
endif() endif()
if (GGML_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
if (CXX_IS_HIPCC) if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-hip PRIVATE hip::device) target_link_libraries(ggml-hip PRIVATE hip::device)