check for vmm support, disable for hip

ggml-ci
2023-12-23 02:29:21 +01:00 · 2023-12-23 02:29:21 +01:00 · 872408cfb7
commit 872408cfb7
parent bd78dc9aee
1 changed files with 48 additions and 27 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -6564,18 +6564,16 @@ struct scoped_spin_lock {

 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;

-#if 0
-#define DEBUG_CUDA_MALLOC
+// #define DEBUG_CUDA_MALLOC
 struct cuda_buffer {
    void * ptr = nullptr;
    size_t size = 0;
 };

 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-
 static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};

-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    return ptr;
 }

-static void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
    g_cuda_pool_size[id] -= size;
 }
-#else

+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
 static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB

-static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
-
-//#define DEBUG_CUDA_MALLOC
-
-#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
-static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
+static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
@ -6681,14 +6675,14 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
        // round up to the nearest granularity
        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);

-        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);

        CUmemGenericAllocationHandle handle;
        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));

        // reserve virtual address space (if not already reserved)
        if (g_cuda_pool_addr[id] == 0) {
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
        }

        // map at the end of the pool
@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
        g_cuda_pool_handles[id].push_back(handle);
        g_cuda_pool_size[id] += reserve_size;

-        printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
-               id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-               (unsigned long long) (reserve_size/1024/1024), call);
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       (unsigned long long) (reserve_size/1024/1024));
    }

    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
    g_cuda_pool_used[id] += size;

 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
 #endif

    return ptr;
-
-    GGML_UNUSED(call);
 }

-#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
-static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
+static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));

 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif

    g_cuda_pool_used[id] -= size;

    // all deallocations must be in reverse order of the allocations
    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
-
-    GGML_UNUSED(call);
 }

+static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
+
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+    } else {
+        return ggml_cuda_pool_malloc_leg(size, actual_size);
+    }
+}
+
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        ggml_cuda_pool_free_vmm(ptr, size);
+    } else {
+        ggml_cuda_pool_free_leg(ptr, size);
+    }
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
 #endif

 static bool g_cublas_loaded = false;
@ -6783,9 +6796,17 @@ void ggml_init_cublas() {
 #endif
        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
        for (int id = 0; id < g_device_count; ++id) {
+            int deviceSupportsVmm = 0;
+#if !defined(GGML_USE_HIPBLAS)
+            CUdevice device;
+            CU_CHECK(cuDeviceGet(&device, id));
+            CU_CHECK(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
+#endif
+            g_device_vmm[id] = !!deviceSupportsVmm;
+
            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+            fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, g_device_vmm[id] ? "yes" : "no");

            g_tensor_split[id] = total_vram;
            total_vram += prop.totalGlobalMem;