From 1a07f604516f2a3643b23ede4e11e75055d54387 Mon Sep 17 00:00:00 2001
From: Markus Tavenrath
Date: Tue, 23 Apr 2024 12:00:10 +0200
Subject: [PATCH] Cleanup tweaks and DSC class. The file copy RAID
 functionality is now protected by a named ifdef (COPY_RAID).

---
 ggml-cuda.cu      | 86 +++++++++++++++++++++++++++++++++++++++++------
 ggml-cuda/dsc.cpp | 71 ++++++++++++++++++--------------------
 ggml-cuda/dsc.h   |  3 ++
 3 files changed, 112 insertions(+), 48 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 8e1207697..b2d974080 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -379,13 +379,35 @@ struct ggml_backend_cuda_buffer_context {
     int device;
     void * dev_ptr = nullptr;
     std::string name;
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    std::unique_ptr<InteropBuffer> direct_storage_buffer;
+#endif
 
     ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
         device(device), dev_ptr(dev_ptr),
         name(GGML_CUDA_NAME + std::to_string(device)) {
     }
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    ggml_backend_cuda_buffer_context(int device, std::unique_ptr<InteropBuffer> && direct_storage_buffer_) :
+        device(device), dev_ptr(nullptr),
+        name(GGML_CUDA_NAME + std::to_string(device)),
+        direct_storage_buffer(std::move(direct_storage_buffer_))
+    {
+        dev_ptr = direct_storage_buffer->get_device_ptr();
+    }
+#endif
+
     ~ggml_backend_cuda_buffer_context() {
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        if (direct_storage_buffer) {
+            direct_storage_buffer.reset();
+            dev_ptr = nullptr;
+        }
+#endif
         CUDA_CHECK(cudaFree(dev_ptr));
     }
 };
@@ -451,7 +473,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
     if (size & (1u << 31)) {
         size &= ~(1u << 31);
         if (data == nullptr) {
-            dsc->flush();
+            std::cout << "flush" << std::endl;
+            dsc->flush(true);
             return;
         }
         struct Temp {
@@ -465,7 +488,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
 
         if (it == files.end()) {
             files[filename].handles.push_back(dsc->openFile(filename));
-#if 0
+#define COPY_RAID
+#if defined(COPY_RAID)
             // This is a hack to evaluate how fast data can be read from a 2nd disk.
             std::filesystem::path p(filename);
             std::filesystem::path p2("d:");
@@ -482,7 +506,34 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
             it = files.find(filename);
         }
 
-        dsc->loadFile(it->second.getFile(), t->weights_off, size, (char*)tensor->data + offset);
+        //dsc->loadFile(it->second.getFile(), t->weights_off, size, (char*)tensor->data + offset);
+        if (ctx->direct_storage_buffer) {
+            size_t tensor_offset = (char*)tensor->data - (char*)ctx->direct_storage_buffer->get_device_ptr();
+#if defined(COPY_RAID)
+            size_t blocksize = 4 * 1024 * 1024;
+            for (size_t idx = 0; idx < size; idx += blocksize) {
+                size_t read_len = size - idx;
+                if (read_len > blocksize)
+                    read_len = blocksize;
+                dsc->loadFile(it->second.getFile(), t->weights_off + idx, read_len, ctx->direct_storage_buffer.get(), offset + tensor_offset + idx);
+            }
+#else
+            dsc->loadFile(it->second.getFile(), t->weights_off, size, ctx->direct_storage_buffer.get(), offset + tensor_offset);
+#endif
+        }
+        else {
+#if defined(COPY_RAID)
+            size_t blocksize = 2 * 1024 * 1024;
+            for (size_t idx = 0; idx < size; idx += blocksize) {
+                size_t read_len = size - idx;
+                if (read_len > blocksize)
+                    read_len = blocksize;
+                dsc->loadFile(it->second.getFile(), t->weights_off + idx, read_len, (char*)tensor->data + offset + idx);
+            }
+#else
+            dsc->loadFile(it->second.getFile(), t->weights_off, size, (char*)tensor->data + offset);
+#endif
+        }
     } else
 #endif
@@ -561,15 +612,20 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
-    void * dev_ptr;
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    if (size < 512 * 1024 * 1024) {
+        auto interop_buffer = dsc->create_interop_buffer(size);
+        ggml_backend_cuda_buffer_context* ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, std::move(interop_buffer));
+        return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
+    }
+#endif
+    void* dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
-
-    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
-
+    ggml_backend_cuda_buffer_context* ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }
@@ -605,11 +661,21 @@ GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backen
     return buft_ctx->device == cuda_ctx->device;
 }
 
+GGML_CALL static size_t ggml_backend_cuda_get_max_size(ggml_backend_buffer_type_t buft)
+{
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    //return 512 * 1024 * 1024; // dx interop limit
+    return SIZE_MAX;
+#else
+    return SIZE_MAX;
+#endif
+} // allocation max size
+
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_max_size     = */ ggml_backend_cuda_get_max_size, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -774,7 +840,7 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
 
 static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-    GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
+    GGML_UNUSED(&ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
 }
 
 GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
diff --git a/ggml-cuda/dsc.cpp b/ggml-cuda/dsc.cpp
index e390cd588..3a47f0f02 100644
--- a/ggml-cuda/dsc.cpp
+++ b/ggml-cuda/dsc.cpp
@@ -124,14 +124,16 @@ public:
         D3D12_HEAP_PROPERTIES bufferHeapProps = {};
         bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
 
+#define USE_HEAP
+#if defined(USE_HEAP)
         D3D12_HEAP_DESC hd = {};
         hd.SizeInBytes = size;
         hd.Properties = bufferHeapProps;
         hd.Flags = D3D12_HEAP_FLAG_SHARED;
         hd.Alignment = 0;
         d3d_device->CreateHeap(&hd, IID_PPV_ARGS(&m_d3d_heap));
-
+#endif
 
         D3D12_RESOURCE_DESC bufferDesc = {};
         bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
@@ -143,27 +145,25 @@ public:
         bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
         bufferDesc.SampleDesc.Count = 1;
 
-//#define USE_BUFFER
-#if defined(USE_BUFFER) //
-        winrt::check_hresult(d3d_device->CreateCommittedResource(
-            &bufferHeapProps,
-            D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
-            &bufferDesc,
-            D3D12_RESOURCE_STATE_COMMON,
-            nullptr,
-            IID_PPV_ARGS(m_d3d_buffer.put())));
-#else
+#if defined(USE_HEAP)
         winrt::check_hresult(d3d_device->CreatePlacedResource(
-        m_d3d_heap.get(),
-        0,
-        &bufferDesc,
-        D3D12_RESOURCE_STATE_COMMON,
-        nullptr,
-        IID_PPV_ARGS(m_d3d_buffer.put())));
+            m_d3d_heap.get(),
+            0,
+            &bufferDesc,
+            D3D12_RESOURCE_STATE_COMMON,
+            nullptr,
+            IID_PPV_ARGS(m_d3d_buffer.put())));
+#else
+        winrt::check_hresult(d3d_device->CreateCommittedResource(
+            &bufferHeapProps,
+            D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
+            &bufferDesc,
+            D3D12_RESOURCE_STATE_COMMON,
+            nullptr,
+            IID_PPV_ARGS(m_d3d_buffer.put())));
 #endif
 
-#if 0
-        // debug begin
+#if defined(DEBUG_READBACK)
         bufferHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
         winrt::check_hresult(d3d_device->CreateCommittedResource(
             &bufferHeapProps,
@@ -176,27 +176,26 @@ public:
         m_host_buffer->Map(0, nullptr, &m_host_ptr);
         d3d_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_cmdallocator));
         d3d_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_cmdallocator.get(), nullptr, IID_PPV_ARGS(&m_cmdlist));
-#endif
 
         D3D12_COMMAND_QUEUE_DESC qd = {};
         qd.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
         d3d_device->CreateCommandQueue(&qd, IID_PPV_ARGS(&m_cmd_queue));
-        // debug end
+#endif
 
         // create a shared handle, required to import the d3d buffer into CUDA
         HANDLE sharedHandle;
         WindowsSecurityAttributes windowsSecurityAttributes;
         LPCWSTR name = NULL;
-#if USE_BUFFER
-        d3d_device->CreateSharedHandle(m_d3d_buffer.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
-
-        cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
-        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
-#else
+#if defined(USE_HEAP)
         d3d_device->CreateSharedHandle(m_d3d_heap.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
 
         cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
         externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap;
+#else
+        d3d_device->CreateSharedHandle(m_d3d_buffer.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
+
+        cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
+        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
 #endif
         externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
         externalMemoryHandleDesc.size = bufferDesc.Width;
@@ -212,11 +211,6 @@ public:
         externalMemoryBufferDesc.flags = 0;
 
         result = cudaExternalMemoryGetMappedBuffer(&m_cuda_dev_ptr, m_externalMemory, &externalMemoryBufferDesc);
-        result = cudaDeviceSynchronize();
-
-        auto err = cudaMemset(m_cuda_dev_ptr, 255, 512*1024*1024);
-        result = cudaDeviceSynchronize();
-        std::cout << "err: " << err << std::endl;
     }
 
     ~InteropBufferImpl() {
@@ -235,19 +229,19 @@ public:
         return m_d3d_buffer.get();
     }
 
+#if defined(DEBUG_READBACK)
     void* get_host_ptr() const {
-#if 0
         m_cmdlist->Reset(m_cmdallocator.get(), nullptr);
         m_cmdlist->CopyResource(m_host_buffer.get(), m_d3d_buffer.get());
         m_cmdlist->Close();
 
         ID3D12CommandList *ptr = m_cmdlist.get();
         m_cmd_queue->ExecuteCommandLists(1, &ptr);
-        Sleep(2);
-#endif
+        Sleep(2); // actually one would have to wait for an event here
 
         return m_host_ptr;
     }
+#endif
 
 private:
     winrt::com_ptr<ID3D12CommandQueue> m_cmd_queue = {};
@@ -257,11 +251,12 @@ private:
     cudaExternalMemory_t m_externalMemory;
     void* m_cuda_dev_ptr;
 
-    // debug
+#if defined(DEBUG_READBACK)
    winrt::com_ptr<ID3D12Resource> m_host_buffer = {};
    winrt::com_ptr<ID3D12GraphicsCommandList> m_cmdlist = {};
    winrt::com_ptr<ID3D12CommandAllocator> m_cmdallocator = {};
    void* m_host_ptr;
+#endif
 };
 
 class DirectStorageCUDAImpl : public DirectStorageCUDA
@@ -450,6 +445,7 @@ private:
         InteropBufferImpl* ibi = static_cast<InteropBufferImpl*>(interop_buffer);
         bool flushed;
         while (read_len) {
+            //std::cout << file.get() << std::endl;
             size_t request_size = min(m_chunk_size, read_len);
 
             DSTORAGE_REQUEST request = {};
@@ -462,7 +458,6 @@ private:
             request.Destination.Buffer.Resource = ibi->get_d3d_buffer();
             request.Destination.Buffer.Offset = interop_buffer_offset;
             request.Destination.Buffer.Size = request_size;
-            //std::cout << read_start / (1024*1024) << " / " << interop_buffer_offset / (1024 * 1024) << "/" << request_size / (1024 * 1024) << std::endl;
 
             m_d3d_storage_queue->EnqueueRequest(&request);
 
@@ -471,7 +466,7 @@ private:
             read_start += request_size;
 
             m_enqueued = true;
-            //flush(true);
+            //flush(false); // flushing less often improves perf a little bit, but removes the ability to track the current load status
         };
     }
diff --git a/ggml-cuda/dsc.h b/ggml-cuda/dsc.h
index 95a8430d5..e5f9088a4 100644
--- a/ggml-cuda/dsc.h
+++ b/ggml-cuda/dsc.h
@@ -34,7 +34,10 @@ class InteropBuffer {
 public:
     virtual ~InteropBuffer() = 0;
     virtual void* get_device_ptr() const = 0;
+
+#if defined(DEBUG_READBACK)
    virtual void* get_host_ptr() const = 0;
+#endif
 };
 
 class DirectStorageCUDA
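-- 
For reference, the D3D12-to-CUDA interop that InteropBufferImpl wraps boils down to the short sequence below: create a shared NT handle for a heap that was created with D3D12_HEAP_FLAG_SHARED, import it as CUDA external memory, and map it to a device pointer. This is a minimal sketch of the USE_HEAP path, not code from the patch; the helper name import_heap_into_cuda and the assert-based error handling are invented for illustration, while the D3D12 and CUDA calls are the actual APIs the class uses.

#include <windows.h>
#include <d3d12.h>
#include <cuda_runtime.h>
#include <cassert>

// Sketch only: maps a shared D3D12 heap into the CUDA address space.
// Assumes d3d_device and heap are valid and the heap was created with
// D3D12_HEAP_FLAG_SHARED; real code should check every HRESULT/cudaError_t.
void* import_heap_into_cuda(ID3D12Device* d3d_device, ID3D12Heap* heap, size_t size) {
    // 1. A shared NT handle is required to hand the heap to CUDA.
    HANDLE shared_handle = nullptr;
    HRESULT hr = d3d_device->CreateSharedHandle(heap, nullptr, GENERIC_ALL, nullptr, &shared_handle);
    assert(SUCCEEDED(hr));

    // 2. Import the heap as CUDA external memory.
    cudaExternalMemoryHandleDesc mem_desc = {};
    mem_desc.type = cudaExternalMemoryHandleTypeD3D12Heap;
    mem_desc.handle.win32.handle = shared_handle;
    mem_desc.size = size;
    cudaExternalMemory_t ext_mem = nullptr;
    assert(cudaImportExternalMemory(&ext_mem, &mem_desc) == cudaSuccess);
    CloseHandle(shared_handle); // CUDA holds its own reference after the import

    // 3. Map the whole heap as a linear device buffer. DirectStorage writes into
    //    the D3D12 resource placed on this heap; CUDA kernels read the same bytes
    //    through this pointer, so no staging copy is needed.
    cudaExternalMemoryBufferDesc buf_desc = {};
    buf_desc.offset = 0;
    buf_desc.size = size;
    void* dev_ptr = nullptr;
    assert(cudaExternalMemoryGetMappedBuffer(&dev_ptr, ext_mem, &buf_desc) == cudaSuccess);
    return dev_ptr; // release with cudaFree(), then cudaDestroyExternalMemory()
}

This zero-copy path is also why ggml_backend_cuda_buffer_type_alloc_buffer only takes the interop route for sizes below 512 MiB: the commented-out "dx interop limit" in ggml_backend_cuda_get_max_size refers to the same cap, and larger buffers fall back to plain cudaMalloc.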