Enable VMM on ROCm

parent 580b619a07
commit 894b489ada

3 changed files with 58 additions and 19 deletions
ggml/src/ggml-cuda/ggml-cuda.cu

@@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
-    cudaGetDevice(&id);
+    (void)cudaGetDevice(&id);

     GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
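Note: the `(void)` casts added in this hunk and the ones below are not cosmetic. `cudaGetDevice` and `cudaGetLastError` return an error code, and the hipified build otherwise emits unused-result warnings when that code is deliberately discarded (recent ROCm headers appear to mark these returns as must-use). A minimal sketch of the pattern, with a hypothetical `fallible()` standing in for the runtime call:

    [[nodiscard]] int fallible();

    void caller() {
        fallible();       // may warn: ignoring return value of 'fallible'
        (void)fallible(); // explicit discard, no warning
    }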
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;

-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
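With `GGML_USE_HIP` dropped from the guard, this VMM-detection block now also compiles under ROCm, relying on the driver-API aliases added to vendors/hip.h below. A standalone sketch of the same query against HIP directly (assumes a ROCm release recent enough to expose the attribute):

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
        int vmm = 0;
        // non-zero when the device supports the virtual memory management API
        hipDeviceGetAttribute(&vmm, hipDeviceAttributeVirtualMemoryManagementSupported, /*device=*/0);
        printf("device 0 VMM support: %d\n", vmm);
        return 0;
    }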
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
         info.devices[id].vmm = !!device_vmm;

         cudaDeviceProp prop;
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };

 // pool with virtual memory
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -309,6 +309,9 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     size_t pool_used = 0;
     size_t pool_size = 0;
     size_t granularity;
+#if defined(GGML_USE_HIP)
+    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
+#endif

     explicit ggml_cuda_pool_vmm(int device) :
         device(device),
@@ -317,7 +320,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {

     ~ggml_cuda_pool_vmm() {
         if (pool_addr != 0) {
+#if defined(GGML_USE_HIP)
+            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
+            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
+                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
+            }
+#else
             CU_CHECK(cuMemUnmap(pool_addr, pool_size));
+#endif
             CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
         }
     }
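The destructor diverges on HIP because of the linked ROCR-Runtime issue: unmapping the whole reserved range with a single call is reported to misbehave there, so on ROCm the pool remembers every mapping it created (the new `mappings` vector) and tears them down one by one. A condensed sketch of the record-then-replay pattern, reusing the names from this diff:

    // on map:   remember each (ptr, size) chunk as it is mapped
    CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
    mappings.push_back({start_ptr, reserve_size});

    // on destroy: unmap chunk by chunk instead of one range-wide cuMemUnmap
    for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
        CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
    }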
@@ -350,7 +360,11 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         }

         // map at the end of the pool
-        CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
+        CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+        CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
+#if defined(GGML_USE_HIP)
+        mappings.push_back({start_ptr, reserve_size});
+#endif

         // the memory allocation handle is no longer needed after mapping
         CU_CHECK(cuMemRelease(handle));
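The rewritten pointer arithmetic here (and in the hunks below) matters for portability: on CUDA, `CUdeviceptr` is an unsigned integer, so `pool_addr + pool_size` is plain integer math, but on ROCm it aliases `hipDeviceptr_t`, which is a `void *`, and arithmetic on `void *` is not standard C++. Casting through `char *` is well-defined byte arithmetic on both toolchains. A hypothetical helper capturing the idiom:

    static CUdeviceptr at_offset(CUdeviceptr base, size_t offset) {
        return (CUdeviceptr)((char *)(base) + offset);
    }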
@@ -360,7 +374,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
         access.location.id = device;
         access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
+        CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));

         // add to the pool
         pool_size += reserve_size;
@@ -372,7 +386,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {

         GGML_ASSERT(pool_addr != 0);

-        void * ptr = (void *) (pool_addr + pool_used);
+        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
         *actual_size = size;
         pool_used += size;

@@ -391,17 +405,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         pool_used -= size;

         // all deallocations must be in reverse order of the allocations
-        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
+        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)

 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }

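Net effect of the guard changes in this file: `new_pool_for_device` now considers the VMM pool on ROCm as well, picking it whenever the device reported VMM support at init time and falling back to the legacy pool otherwise, or when the build opts out via `GGML_CUDA_NO_VMM`.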
@@ -547,7 +561,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
         GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
@@ -962,7 +976,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
         GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
@@ -1197,7 +1211,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                 CUDA_CHECK(err);
             } else {
                 // reset the error
-                cudaGetLastError();
+                (void)cudaGetLastError();
             }
         } else {
             cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
@@ -1205,7 +1219,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                 CUDA_CHECK(err);
             } else {
                 // reset the error
-                cudaGetLastError();
+                (void)cudaGetLastError();
             }
         }
     }
@@ -2438,7 +2452,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
         if (stat == cudaErrorInvalidDeviceFunction) {
             // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
             // We don't need to update blas nodes, so clear error and move on.
-            cudaGetLastError();
+            (void)cudaGetLastError();
         } else {
             GGML_ASSERT(stat == cudaSuccess);
         }
@@ -2506,7 +2520,7 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
-        cudaGetLastError();
+        (void)cudaGetLastError();
         CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
         cuda_ctx->cuda_graph->instance = nullptr;
         CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
@@ -2734,7 +2748,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();

         GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, cudaGetErrorString(err));
@@ -2754,7 +2768,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
    cudaError_t err = cudaHostUnregister(buffer);
    if (err != cudaSuccess) {
        // clear the error
-       cudaGetLastError();
+       (void)cudaGetLastError();
    }
 }

ggml/src/ggml-cuda/vendors/hip.h (vendored, 21 additions)
@@ -19,6 +19,12 @@
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
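These aliases, together with the driver-API names added in the next hunk, let the VMM pool code in ggml-cuda.cu compile unchanged under hipcc; `CU_CHECK` mirrors the CUDA build's checking macro but aborts through `GGML_ABORT` with the HIP error string. For instance, a driver-API line expands on ROCm as shown in the comments:

    CUdevice dev;                    // hipDevice_t dev;
    CU_CHECK(cuDeviceGet(&dev, 0));  // {hipError_t err = hipDeviceGet(&dev, 0); if(err != hipSuccess) { ... }}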
@@ -74,6 +80,21 @@
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
ggml/src/ggml-hip/CMakeLists.txt

@@ -96,6 +96,10 @@ if (GGML_HIP_GRAPHS)
     add_compile_definitions(GGML_HIP_GRAPHS)
 endif()

+if (GGML_CUDA_NO_VMM)
+    add_compile_definitions(GGML_CUDA_NO_VMM)
+endif()
+
 if (CXX_IS_HIPCC)
     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-hip PRIVATE hip::device)
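The new CMake block mirrors the CUDA backend's handling of the same option: configuring the HIP build with `-DGGML_CUDA_NO_VMM=ON` defines `GGML_CUDA_NO_VMM`, which disables the VMM pool at compile time and gives ROCm users an escape hatch back to the legacy pool if the VMM path misbehaves on their driver.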