check for vmm support, disable for hip

ggml-ci
2023-12-23 02:29:21 +01:00 · 2023-12-23 02:29:21 +01:00 · 872408cfb7
commit 872408cfb7
parent bd78dc9aee
1 changed files with 48 additions and 27 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -6564,18 +6564,16 @@ struct scoped_spin_lock {
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
-#if 0
+// #define DEBUG_CUDA_MALLOC
 #define DEBUG_CUDA_MALLOC
 struct cuda_buffer {
    void * ptr = nullptr;
    size_t size = 0;
 };
 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
 static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    return ptr;
 }
-static void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
    g_cuda_pool_size[id] -= size;
 }
 #else
 #if !defined(GGML_USE_HIPBLAS)
 // pool with virtual memory
 static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
 static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
-static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
+static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
 //#define DEBUG_CUDA_MALLOC
 #define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
 static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
@ -6681,14 +6675,14 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
        // round up to the nearest granularity
        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
        CUmemGenericAllocationHandle handle;
        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
        // reserve virtual address space (if not already reserved)
        if (g_cuda_pool_addr[id] == 0) {
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
        }
        // map at the end of the pool
@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
        g_cuda_pool_handles[id].push_back(handle);
        g_cuda_pool_size[id] += reserve_size;
-        printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
-               id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-               (unsigned long long) (reserve_size/1024/1024), call);
+        //       (unsigned long long) (reserve_size/1024/1024));
    }
    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
    g_cuda_pool_used[id] += size;
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
 #endif
    return ptr;
    GGML_UNUSED(call);
 }
-#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
+static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
 static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
    g_cuda_pool_used[id] -= size;
    // all deallocations must be in reverse order of the allocations
    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
    GGML_UNUSED(call);
 }
 static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
 static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    if (g_device_vmm[id]) {
        return ggml_cuda_pool_malloc_vmm(size, actual_size);
    } else {
        return ggml_cuda_pool_malloc_leg(size, actual_size);
    }
 }
 static void ggml_cuda_pool_free(void * ptr, size_t size) {
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    if (g_device_vmm[id]) {
        ggml_cuda_pool_free_vmm(ptr, size);
    } else {
        ggml_cuda_pool_free_leg(ptr, size);
    }
 }
 #else
 #define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
 #define ggml_cuda_pool_free ggml_cuda_pool_free_leg
 #endif
 static bool g_cublas_loaded = false;
@ -6783,9 +6796,17 @@ void ggml_init_cublas() {
 #endif
        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
        for (int id = 0; id < g_device_count; ++id) {
            int deviceSupportsVmm = 0;
 #if !defined(GGML_USE_HIPBLAS)
            CUdevice device;
            CU_CHECK(cuDeviceGet(&device, id));
            CU_CHECK(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
 #endif
            g_device_vmm[id] = !!deviceSupportsVmm;
            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+            fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, g_device_vmm[id] ? "yes" : "no");
            g_tensor_split[id] = total_vram;
            total_vram += prop.totalGlobalMem;