check for vmm support, disable for hip
ggml-ci
This commit is contained in:
parent
bd78dc9aee
commit
872408cfb7
1 changed files with 48 additions and 27 deletions
75
ggml-cuda.cu
75
ggml-cuda.cu
|
@ -6564,18 +6564,16 @@ struct scoped_spin_lock {
|
||||||
|
|
||||||
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
|
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
|
||||||
|
|
||||||
#if 0
|
// #define DEBUG_CUDA_MALLOC
|
||||||
#define DEBUG_CUDA_MALLOC
|
|
||||||
struct cuda_buffer {
|
struct cuda_buffer {
|
||||||
void * ptr = nullptr;
|
void * ptr = nullptr;
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
|
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
|
||||||
|
|
||||||
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
|
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
|
|
||||||
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||||
CUDA_CHECK(cudaFree(ptr));
|
CUDA_CHECK(cudaFree(ptr));
|
||||||
g_cuda_pool_size[id] -= size;
|
g_cuda_pool_size[id] -= size;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
|
|
||||||
|
#if !defined(GGML_USE_HIPBLAS)
|
||||||
|
// pool with virtual memory
|
||||||
static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
|
static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
|
||||||
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
|
|
||||||
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
||||||
|
|
||||||
static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
|
static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
|
||||||
|
|
||||||
//#define DEBUG_CUDA_MALLOC
|
|
||||||
|
|
||||||
#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
|
|
||||||
static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
|
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
@ -6681,14 +6675,14 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
|
||||||
// round up to the nearest granularity
|
// round up to the nearest granularity
|
||||||
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
|
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
|
||||||
|
|
||||||
GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
|
GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
||||||
|
|
||||||
CUmemGenericAllocationHandle handle;
|
CUmemGenericAllocationHandle handle;
|
||||||
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
|
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
|
||||||
|
|
||||||
// reserve virtual address space (if not already reserved)
|
// reserve virtual address space (if not already reserved)
|
||||||
if (g_cuda_pool_addr[id] == 0) {
|
if (g_cuda_pool_addr[id] == 0) {
|
||||||
CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
|
CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
// map at the end of the pool
|
// map at the end of the pool
|
||||||
|
@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
|
||||||
g_cuda_pool_handles[id].push_back(handle);
|
g_cuda_pool_handles[id].push_back(handle);
|
||||||
g_cuda_pool_size[id] += reserve_size;
|
g_cuda_pool_size[id] += reserve_size;
|
||||||
|
|
||||||
printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
|
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||||
id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
|
// id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
|
||||||
(unsigned long long) (reserve_size/1024/1024), call);
|
// (unsigned long long) (reserve_size/1024/1024));
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(g_cuda_pool_addr[id] != 0);
|
GGML_ASSERT(g_cuda_pool_addr[id] != 0);
|
||||||
|
@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
|
||||||
g_cuda_pool_used[id] += size;
|
g_cuda_pool_used[id] += size;
|
||||||
|
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
|
printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return ptr;
|
return ptr;
|
||||||
|
|
||||||
GGML_UNUSED(call);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
|
static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
|
||||||
static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
|
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
|
printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
g_cuda_pool_used[id] -= size;
|
g_cuda_pool_used[id] -= size;
|
||||||
|
|
||||||
// all deallocations must be in reverse order of the allocations
|
// all deallocations must be in reverse order of the allocations
|
||||||
GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
|
GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
|
||||||
|
|
||||||
GGML_UNUSED(call);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
|
||||||
|
|
||||||
|
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||||
|
int id;
|
||||||
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
if (g_device_vmm[id]) {
|
||||||
|
return ggml_cuda_pool_malloc_vmm(size, actual_size);
|
||||||
|
} else {
|
||||||
|
return ggml_cuda_pool_malloc_leg(size, actual_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||||
|
int id;
|
||||||
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
if (g_device_vmm[id]) {
|
||||||
|
ggml_cuda_pool_free_vmm(ptr, size);
|
||||||
|
} else {
|
||||||
|
ggml_cuda_pool_free_leg(ptr, size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
|
||||||
|
#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static bool g_cublas_loaded = false;
|
static bool g_cublas_loaded = false;
|
||||||
|
@ -6783,9 +6796,17 @@ void ggml_init_cublas() {
|
||||||
#endif
|
#endif
|
||||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
|
int deviceSupportsVmm = 0;
|
||||||
|
#if !defined(GGML_USE_HIPBLAS)
|
||||||
|
CUdevice device;
|
||||||
|
CU_CHECK(cuDeviceGet(&device, id));
|
||||||
|
CU_CHECK(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||||
|
#endif
|
||||||
|
g_device_vmm[id] = !!deviceSupportsVmm;
|
||||||
|
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||||
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, g_device_vmm[id] ? "yes" : "no");
|
||||||
|
|
||||||
g_tensor_split[id] = total_vram;
|
g_tensor_split[id] = total_vram;
|
||||||
total_vram += prop.totalGlobalMem;
|
total_vram += prop.totalGlobalMem;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue