From 1a07f604516f2a3643b23ede4e11e75055d54387 Mon Sep 17 00:00:00 2001
From: Markus Tavenrath
Date: Tue, 23 Apr 2024 12:00:10 +0200
Subject: [PATCH] Cleanup tweaks and DSC class. The file copy RAID
 functionality is now protected by a named ifdef (COPY_RAID).

---
 ggml-cuda.cu      | 86 +++++++++++++++++++++++++++++++++++++++++------
 ggml-cuda/dsc.cpp | 71 ++++++++++++++++++--------------------
 ggml-cuda/dsc.h   |  3 ++
 3 files changed, 112 insertions(+), 48 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 8e1207697..b2d974080 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -379,13 +379,35 @@ struct ggml_backend_cuda_buffer_context {
     int device;
     void * dev_ptr = nullptr;
     std::string name;
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    std::unique_ptr<InteropBuffer> direct_storage_buffer;
+#endif
 
     ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
         device(device), dev_ptr(dev_ptr),
         name(GGML_CUDA_NAME + std::to_string(device)) {
     }
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    ggml_backend_cuda_buffer_context(int device, std::unique_ptr<InteropBuffer> && direct_storage_buffer_) :
+        device(device), dev_ptr(nullptr),
+        name(GGML_CUDA_NAME + std::to_string(device)),
+        direct_storage_buffer(std::move(direct_storage_buffer_))
+    {
+        dev_ptr = direct_storage_buffer->get_device_ptr();
+    }
+#endif
+
     ~ggml_backend_cuda_buffer_context() {
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        if (direct_storage_buffer) {
+            direct_storage_buffer.reset();
+            dev_ptr = nullptr;
+        }
+#endif
         CUDA_CHECK(cudaFree(dev_ptr));
     }
 };
@@ -451,7 +473,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
     if (size & (1u << 31)) {
         size &= ~(1u << 31);
         if (data == nullptr) {
-            dsc->flush();
+            std::cout << "flush" << std::endl;
+            dsc->flush(true);
             return;
         }
         struct Temp {
@@ -465,7 +488,8 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
 
         if (it == files.end()) {
             files[filename].handles.push_back(dsc->openFile(filename));
-#if 0
+#define COPY_RAID
+#if defined(COPY_RAID)
             // This is a hack to evaluate how fast data can be read from a 2nd disk.
             std::filesystem::path p(filename);
             std::filesystem::path p2("d:");
@@ -482,7 +506,34 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t
             it = files.find(filename);
         }
 
-        dsc->loadFile(it->second.getFile(), t->weights_off, size, (char*)tensor->data + offset);
+        //dsc->loadFile(it->second.getFile(), t->weights_off, size, (char*)tensor->data + offset);
+        if (ctx->direct_storage_buffer) {
+            size_t tensor_offset = (char*)tensor->data - (char*)ctx->direct_storage_buffer->get_device_ptr();
+#if defined(COPY_RAID)
+            size_t blocksize = 4 * 1024 * 1024;
+            for (size_t idx = 0; idx < size; idx += blocksize) {
+                size_t read_len = size - idx;
+                if (read_len > blocksize)
+                    read_len = blocksize;
+                dsc->loadFile(it->second.getFile(), t->weights_off + idx, read_len, ctx->direct_storage_buffer.get(), offset + tensor_offset + idx);
+            }
+#else
+            dsc->loadFile(it->second.getFile(), t->weights_off, size, ctx->direct_storage_buffer.get(), offset + tensor_offset);
+#endif
+        }
+        else {
+#if defined(COPY_RAID)
+            size_t blocksize = 2 * 1024 * 1024;
+            for (size_t idx = 0; idx < size; idx += blocksize) {
+                size_t read_len = size - idx;
+                if (read_len > blocksize)
+                    read_len = blocksize;
+                dsc->loadFile(it->second.getFile(), t->weights_off + idx, read_len, (char*)tensor->data + offset + idx);
+            }
+#else
+            dsc->loadFile(it->second.getFile(), t->weights_off, size, (char*)tensor->data + offset);
+#endif
+        }
     } else
 #endif
@@ -561,15 +612,20 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
-    void * dev_ptr;
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    if (size < 512 * 1024 * 1024) {
+        auto interop_buffer = dsc->create_interop_buffer(size);
+        ggml_backend_cuda_buffer_context* ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, std::move(interop_buffer));
+        return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
+    }
+#endif
+    void* dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
-
-    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
-
+    ggml_backend_cuda_buffer_context* ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }
@@ -605,11 +661,21 @@ GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backen
     return buft_ctx->device == cuda_ctx->device;
 }
 
+GGML_CALL static size_t ggml_backend_cuda_get_max_size(ggml_backend_buffer_type_t buft)
+{
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    //return 512 * 1024 * 1024; // dx interop limit
+    return SIZE_MAX;
+#else
+    return SIZE_MAX;
+#endif
+} // allocation max size
+
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_max_size     = */ ggml_backend_cuda_get_max_size, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -774,7 +840,7 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
 
 static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-    GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
+    GGML_UNUSED(&ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
 }
 
 GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
diff --git a/ggml-cuda/dsc.cpp b/ggml-cuda/dsc.cpp
index e390cd588..3a47f0f02 100644
--- a/ggml-cuda/dsc.cpp
+++ b/ggml-cuda/dsc.cpp
@@ -124,14 +124,16 @@ public:
         D3D12_HEAP_PROPERTIES bufferHeapProps = {};
         bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
 
+#define USE_HEAP
+#if defined(USE_HEAP)
         D3D12_HEAP_DESC hd = {};
         hd.SizeInBytes = size;
         hd.Properties = bufferHeapProps;
         hd.Flags = D3D12_HEAP_FLAG_SHARED;
         hd.Alignment = 0;
         d3d_device->CreateHeap(&hd, IID_PPV_ARGS(&m_d3d_heap));
-
+#endif
 
         D3D12_RESOURCE_DESC bufferDesc = {};
         bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
@@ -143,27 +145,25 @@ public:
         bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
         bufferDesc.SampleDesc.Count = 1;
 
-//#define USE_BUFFER
-#if defined(USE_BUFFER) //
-        winrt::check_hresult(d3d_device->CreateCommittedResource(
-            &bufferHeapProps,
-            D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
-            &bufferDesc,
-            D3D12_RESOURCE_STATE_COMMON,
-            nullptr,
-            IID_PPV_ARGS(m_d3d_buffer.put())));
-#else
+#if defined(USE_HEAP)
         winrt::check_hresult(d3d_device->CreatePlacedResource(
-        m_d3d_heap.get(),
-        0,
-        &bufferDesc,
-        D3D12_RESOURCE_STATE_COMMON,
-        nullptr,
-        IID_PPV_ARGS(m_d3d_buffer.put())));
+            m_d3d_heap.get(),
+            0,
+            &bufferDesc,
+            D3D12_RESOURCE_STATE_COMMON,
+            nullptr,
+            IID_PPV_ARGS(m_d3d_buffer.put())));
+#else
+        winrt::check_hresult(d3d_device->CreateCommittedResource(
+            &bufferHeapProps,
+            D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
+            &bufferDesc,
+            D3D12_RESOURCE_STATE_COMMON,
+            nullptr,
+            IID_PPV_ARGS(m_d3d_buffer.put())));
 #endif
 
-#if 0
-        // debug begin
+#if defined(DEBUG_READBACK)
         bufferHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
         winrt::check_hresult(d3d_device->CreateCommittedResource(
             &bufferHeapProps,
@@ -176,27 +176,26 @@ public:
         m_host_buffer->Map(0, nullptr, &m_host_ptr);
         d3d_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_cmdallocator));
         d3d_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_cmdallocator.get(), nullptr, IID_PPV_ARGS(&m_cmdlist));
-#endif
 
         D3D12_COMMAND_QUEUE_DESC qd = {};
         qd.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
         d3d_device->CreateCommandQueue(&qd, IID_PPV_ARGS(&m_cmd_queue));
-        // debug end
+#endif
 
         // create a shared handle, required to import the d3d buffer into CUDA
         HANDLE sharedHandle;
         WindowsSecurityAttributes windowsSecurityAttributes;
         LPCWSTR name = NULL;
-#if USE_BUFFER
-        d3d_device->CreateSharedHandle(m_d3d_buffer.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
-
-        cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
-        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
-#else
+#if defined(USE_HEAP)
         d3d_device->CreateSharedHandle(m_d3d_heap.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
 
         cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
         externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap;
+#else
+        d3d_device->CreateSharedHandle(m_d3d_buffer.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
+
+        cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
+        externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
 #endif
         externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
         externalMemoryHandleDesc.size = bufferDesc.Width;
@@ -212,11 +211,6 @@ public:
         externalMemoryBufferDesc.flags = 0;
 
         result = cudaExternalMemoryGetMappedBuffer(&m_cuda_dev_ptr, m_externalMemory, &externalMemoryBufferDesc);
-        result = cudaDeviceSynchronize();
-
-        auto err = cudaMemset(m_cuda_dev_ptr, 255, 512*1024*1024);
-        result = cudaDeviceSynchronize();
-        std::cout << "err: " << err << std::endl;
     }
 
     ~InteropBufferImpl() {
@@ -235,19 +229,19 @@ public:
         return m_d3d_buffer.get();
     }
 
+#if defined(DEBUG_READBACK)
     void* get_host_ptr() const {
-#if 0
         m_cmdlist->Reset(m_cmdallocator.get(), nullptr);
         m_cmdlist->CopyResource(m_host_buffer.get(), m_d3d_buffer.get());
         m_cmdlist->Close();
 
         ID3D12CommandList *ptr = m_cmdlist.get();
         m_cmd_queue->ExecuteCommandLists(1, &ptr);
-        Sleep(2);
-#endif
+        Sleep(2); // actually one would have to wait for an event here
 
         return m_host_ptr;
     }
+#endif
 
 private:
     winrt::com_ptr<ID3D12CommandQueue> m_cmd_queue = {};
@@ -257,11 +251,12 @@ private:
     cudaExternalMemory_t m_externalMemory;
     void* m_cuda_dev_ptr;
 
-    // debug
+#if defined(DEBUG_READBACK)
    winrt::com_ptr<ID3D12Resource> m_host_buffer = {};
    winrt::com_ptr<ID3D12GraphicsCommandList> m_cmdlist = {};
    winrt::com_ptr<ID3D12CommandAllocator> m_cmdallocator = {};
    void* m_host_ptr;
+#endif
 };
 
 class DirectStorageCUDAImpl : public DirectStorageCUDA
@@ -450,6 +445,7 @@ private:
         InteropBufferImpl* ibi = static_cast<InteropBufferImpl*>(interop_buffer);
         bool flushed;
         while (read_len) {
+            //std::cout << file.get() << std::endl;
             size_t request_size = min(m_chunk_size, read_len);
 
             DSTORAGE_REQUEST request = {};
@@ -462,7 +458,6 @@ private:
             request.Destination.Buffer.Resource = ibi->get_d3d_buffer();
             request.Destination.Buffer.Offset = interop_buffer_offset;
             request.Destination.Buffer.Size = request_size;
-            //std::cout << read_start / (1024*1024) << " / " << interop_buffer_offset / (1024 * 1024) << "/" << request_size / (1024 * 1024) << std::endl;
 
             m_d3d_storage_queue->EnqueueRequest(&request);
 
@@ -471,7 +466,7 @@ private:
             read_start += request_size;
 
             m_enqueued = true;
-            //flush(true);
+            //flush(false); // flushing less often improves perf a little bit, but removes the ability to track the current load status
         };
     }
diff --git a/ggml-cuda/dsc.h b/ggml-cuda/dsc.h
index 95a8430d5..e5f9088a4 100644
--- a/ggml-cuda/dsc.h
+++ b/ggml-cuda/dsc.h
@@ -34,7 +34,10 @@ class InteropBuffer {
 public:
     virtual ~InteropBuffer() = 0;
     virtual void* get_device_ptr() const = 0;
+
+#if defined(DEBUG_READBACK)
    virtual void* get_host_ptr() const = 0;
+#endif
 };
 
 class DirectStorageCUDA
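-- 
For reference, the D3D12-to-CUDA interop that InteropBufferImpl wraps boils down to the short sequence below: create a shared NT handle for a heap that was created with D3D12_HEAP_FLAG_SHARED, import it as CUDA external memory, and map it to a device pointer. This is a minimal sketch of the USE_HEAP path, not code from the patch; the helper name import_heap_into_cuda and the assert-based error handling are invented for illustration, while the D3D12 and CUDA calls are the actual APIs the class uses.

#include <windows.h>
#include <d3d12.h>
#include <cuda_runtime.h>
#include <cassert>

// Sketch only: maps a shared D3D12 heap into the CUDA address space.
// Assumes d3d_device and heap are valid and the heap was created with
// D3D12_HEAP_FLAG_SHARED; real code should check every HRESULT/cudaError_t.
void* import_heap_into_cuda(ID3D12Device* d3d_device, ID3D12Heap* heap, size_t size) {
    // 1. A shared NT handle is required to hand the heap to CUDA.
    HANDLE shared_handle = nullptr;
    HRESULT hr = d3d_device->CreateSharedHandle(heap, nullptr, GENERIC_ALL, nullptr, &shared_handle);
    assert(SUCCEEDED(hr));

    // 2. Import the heap as CUDA external memory.
    cudaExternalMemoryHandleDesc mem_desc = {};
    mem_desc.type = cudaExternalMemoryHandleTypeD3D12Heap;
    mem_desc.handle.win32.handle = shared_handle;
    mem_desc.size = size;
    cudaExternalMemory_t ext_mem = nullptr;
    assert(cudaImportExternalMemory(&ext_mem, &mem_desc) == cudaSuccess);
    CloseHandle(shared_handle); // CUDA holds its own reference after the import

    // 3. Map the whole heap as a linear device buffer. DirectStorage writes into
    //    the D3D12 resource placed on this heap; CUDA kernels read the same bytes
    //    through this pointer, so no staging copy is needed.
    cudaExternalMemoryBufferDesc buf_desc = {};
    buf_desc.offset = 0;
    buf_desc.size = size;
    void* dev_ptr = nullptr;
    assert(cudaExternalMemoryGetMappedBuffer(&dev_ptr, ext_mem, &buf_desc) == cudaSuccess);
    return dev_ptr; // release with cudaFree(), then cudaDestroyExternalMemory()
}

This zero-copy path is also why ggml_backend_cuda_buffer_type_alloc_buffer only takes the interop route for sizes below 512 MiB: the commented-out "dx interop limit" in ggml_backend_cuda_get_max_size refers to the same cap, and larger buffers fall back to plain cudaMalloc.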