diff --git a/ggml-cuda/dsc.cpp b/ggml-cuda/dsc.cpp new file mode 100644 index 000000000..e390cd588 --- /dev/null +++ b/ggml-cuda/dsc.cpp @@ -0,0 +1,652 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "dsc.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // WindowsSecurityAttributes +#include +#include +#include + +#include + + +class WindowsSecurityAttributes +{ +protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes = {}; + SECURITY_DESCRIPTOR m_securityDescriptor = {}; + PSID pSID = 0; + PACL pACL = 0; + +public: + WindowsSecurityAttributes() + { + InitializeSecurityDescriptor(&m_securityDescriptor, SECURITY_DESCRIPTOR_REVISION); + + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, &pSID); + + EXPLICIT_ACCESS explicitAccess = {}; + explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = reinterpret_cast(pSID); + + SetEntriesInAcl(1, &explicitAccess, NULL, &pACL); + SetSecurityDescriptorDacl(&m_securityDescriptor, TRUE, pACL, FALSE); + + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = &m_securityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; + } + + WindowsSecurityAttributes(WindowsSecurityAttributes const& rhs) = delete; + WindowsSecurityAttributes(WindowsSecurityAttributes const&& rhs) = delete; + + ~WindowsSecurityAttributes() { + if (pSID) + { + FreeSid(pSID); + } + if (pACL) + { + LocalFree(pACL); + } + } + + operator SECURITY_ATTRIBUTES const* () const { + return &m_winSecurityAttributes; + } +}; + +DirectStorageCUDA::~DirectStorageCUDA() +{ +} + +struct DirectStorageCUDAFileHandleImpl : DirectStorageCUDAFileHandle +{ + ~DirectStorageCUDAFileHandleImpl() {}; + + using File = winrt::com_ptr; + File file; + + IDStorageFile* get() { return file.get(); } + IDStorageFile** put() { return file.put(); } +}; + +InteropBuffer::~InteropBuffer() +{ +} + +class InteropBufferImpl : public InteropBuffer +{ +public: + InteropBufferImpl(winrt::com_ptr const& d3d_device, size_t size) + { + + // Create the ID3D12Resource buffer which will be used as temporary scratch space for d3d + // since it's not possible to import CUDA memory into DX. + D3D12_HEAP_PROPERTIES bufferHeapProps = {}; + bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT; + + + D3D12_HEAP_DESC hd = {}; + hd.SizeInBytes = size; + hd.Properties = bufferHeapProps; + hd.Flags = D3D12_HEAP_FLAG_SHARED; + hd.Alignment = 0; + d3d_device->CreateHeap(&hd, IID_PPV_ARGS(&m_d3d_heap)); + + + D3D12_RESOURCE_DESC bufferDesc = {}; + bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + bufferDesc.Width = size; + bufferDesc.Height = 1; + bufferDesc.DepthOrArraySize = 1; + bufferDesc.MipLevels = 1; + bufferDesc.Format = DXGI_FORMAT_UNKNOWN; + bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + bufferDesc.SampleDesc.Count = 1; + +//#define USE_BUFFER +#if defined(USE_BUFFER) // + winrt::check_hresult(d3d_device->CreateCommittedResource( + &bufferHeapProps, + D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED, + &bufferDesc, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_PPV_ARGS(m_d3d_buffer.put()))); +#else + winrt::check_hresult(d3d_device->CreatePlacedResource( + m_d3d_heap.get(), + 0, + &bufferDesc, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_PPV_ARGS(m_d3d_buffer.put()))); +#endif + +#if 0 + // debug begin + bufferHeapProps.Type = D3D12_HEAP_TYPE_READBACK; + winrt::check_hresult(d3d_device->CreateCommittedResource( + &bufferHeapProps, + D3D12_HEAP_FLAG_NONE, + &bufferDesc, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_PPV_ARGS(m_host_buffer.put()))); + + m_host_buffer->Map(0, nullptr, &m_host_ptr); + d3d_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_cmdallocator)); + d3d_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_cmdallocator.get(), nullptr, IID_PPV_ARGS(&m_cmdlist)); +#endif + + D3D12_COMMAND_QUEUE_DESC qd = {}; + qd.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + d3d_device->CreateCommandQueue(&qd, IID_PPV_ARGS(&m_cmd_queue)); + // debug end + + // create a shared handle to require to import the d3d buffer into CUDA + HANDLE sharedHandle; + WindowsSecurityAttributes windowsSecurityAttributes; + LPCWSTR name = NULL; +#if USE_BUFFER + d3d_device->CreateSharedHandle(m_d3d_buffer.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle); + + cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; +#else + d3d_device->CreateSharedHandle(m_d3d_heap.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle); + + cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap; +#endif + externalMemoryHandleDesc.handle.win32.handle = sharedHandle; + externalMemoryHandleDesc.size = bufferDesc.Width; + externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + auto result = cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc); + + CloseHandle(sharedHandle); + + // get pointer to external memory imported form d3d + cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {}; + externalMemoryBufferDesc.offset = 0; + externalMemoryBufferDesc.size = externalMemoryHandleDesc.size; + externalMemoryBufferDesc.flags = 0; + + result = cudaExternalMemoryGetMappedBuffer(&m_cuda_dev_ptr, m_externalMemory, &externalMemoryBufferDesc); + result = cudaDeviceSynchronize(); + + auto err = cudaMemset(m_cuda_dev_ptr, 255, 512*1024*1024); + result = cudaDeviceSynchronize(); + std::cout << "err: " << err << std::endl; + } + + ~InteropBufferImpl() { + auto result = cudaDestroyExternalMemory(m_externalMemory); + cudaFree(m_cuda_dev_ptr); + if (result != cudaSuccess) { + std::cout << "cudaDestroyExternalMemory interop buffer: " << result << std::endl; + } + } + + void* get_device_ptr() const { + return m_cuda_dev_ptr; + } + + ID3D12Resource* get_d3d_buffer() const { + return m_d3d_buffer.get(); + } + + void* get_host_ptr() const { +#if 0 + m_cmdlist->Reset(m_cmdallocator.get(), nullptr); + m_cmdlist->CopyResource(m_host_buffer.get(), m_d3d_buffer.get()); + m_cmdlist->Close(); + + ID3D12CommandList *ptr = m_cmdlist.get(); + m_cmd_queue->ExecuteCommandLists(1, &ptr); + Sleep(2); +#endif + + return m_host_ptr; + } + +private: + winrt::com_ptr m_cmd_queue = {}; + winrt::com_ptr m_d3d_buffer = {}; + winrt::com_ptr m_d3d_heap = {}; + + cudaExternalMemory_t m_externalMemory; + void* m_cuda_dev_ptr; + + // debug + winrt::com_ptr m_host_buffer = {}; + winrt::com_ptr m_cmdlist = {}; + winrt::com_ptr m_cmdallocator = {}; + void* m_host_ptr; +}; + +class DirectStorageCUDAImpl : public DirectStorageCUDA +{ +public: + DirectStorageCUDAImpl(int scratch_size, int number_of_scratch_spaces); + + virtual ~DirectStorageCUDAImpl() { + flush(true); + std::cout << "~DirectStorageCudaImpl" << std::endl; + } + + struct FileInfo { + const std::string& filename; + void* cuda_device_ptr; + size_t offset; + size_t size; + }; + + virtual std::unique_ptr create_interop_buffer(size_t size); + virtual DirectStorageCUDA::File openFile(std::string const& filename); + virtual void loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr); + virtual void loadFile(File const& file, size_t read_start, size_t read_len, InteropBuffer* interop_buffer, size_t interop_buffer_offset); + virtual void flush(bool last); +private: + class StagingArea + { + public: + StagingArea(winrt::com_ptr d3d_device, winrt::com_ptr d3d_factory, size_t chunk_size, size_t number_of_chunks) + : m_d3d_device(d3d_device) + , m_d3d_factory(d3d_factory) + , m_chunk_size(chunk_size) + , m_number_of_chunks(number_of_chunks) + , m_total_staging_space(chunk_size * number_of_chunks) + { + // Create a DirectStorage queue which will be used to load data into a + // buffer on the GPU. + DSTORAGE_QUEUE_DESC queueDesc{}; + queueDesc.Capacity = DSTORAGE_MAX_QUEUE_CAPACITY; + queueDesc.Priority = DSTORAGE_PRIORITY_NORMAL; + queueDesc.SourceType = DSTORAGE_REQUEST_SOURCE_FILE; + queueDesc.Device = m_d3d_device.get(); + + winrt::check_hresult(m_d3d_factory->CreateQueue(&queueDesc, IID_PPV_ARGS(m_d3d_storage_queue.put()))); + + // Create the ID3D12Resource buffer which will be used as temporary scratch space for d3d + // since it's not possible to import CUDA memory into DX. + D3D12_HEAP_PROPERTIES bufferHeapProps = {}; + bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT; + + D3D12_RESOURCE_DESC bufferDesc = {}; + bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + bufferDesc.Width = m_chunk_size * m_number_of_chunks; + bufferDesc.Height = 1; + bufferDesc.DepthOrArraySize = 1; + bufferDesc.MipLevels = 1; + bufferDesc.Format = DXGI_FORMAT_UNKNOWN; + bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + bufferDesc.SampleDesc.Count = 1; + + winrt::check_hresult(m_d3d_device->CreateCommittedResource( + &bufferHeapProps, + D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED, + &bufferDesc, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_PPV_ARGS(m_d3d_scratch_space.put()))); + + + // create a shared handle to require to import the d3d buffer into CUDA + HANDLE sharedHandle; + WindowsSecurityAttributes windowsSecurityAttributes; + LPCWSTR name = NULL; + m_d3d_device->CreateSharedHandle(m_d3d_scratch_space.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle); + + cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; + externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource; + externalMemoryHandleDesc.handle.win32.handle = sharedHandle; + externalMemoryHandleDesc.size = bufferDesc.Width; + externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; + auto result = cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc); + + CloseHandle(sharedHandle); + + // get pointer to external memory imported form d3d + cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {}; + externalMemoryBufferDesc.offset = 0; + externalMemoryBufferDesc.size = externalMemoryHandleDesc.size; + externalMemoryBufferDesc.flags = 0; + + result = cudaExternalMemoryGetMappedBuffer(&m_cuda_scratch_space, m_externalMemory, &externalMemoryBufferDesc); + + // create d3d fence for synchronization + auto resultDx = m_d3d_device->CreateFence(0, D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(&m_d3d_fence)); + + // import d3d fence as semaphore into CUDA. + cudaExternalSemaphoreHandleDesc extSemHandleDesc = {}; + extSemHandleDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence; + m_d3d_device->CreateSharedHandle(m_d3d_fence.get(), nullptr, GENERIC_ALL, nullptr, &extSemHandleDesc.handle.win32.handle); + result = cudaImportExternalSemaphore(&m_externalSemaphore, &extSemHandleDesc); + + cudaStreamCreate(&m_cudaStream); + + // intialize fence to wait for in flush + waitParams.params.fence.value = 1; + } + + ~StagingArea() + { + std::cout << "~StagingArea" << std::endl; + auto result = cudaDestroyExternalMemory(m_externalMemory); + cudaFree(m_cuda_scratch_space); + if (result != cudaSuccess) { + std::cout << "cudaDestroyExternalMemory interop buffer: " << result << std::endl; + } + // TODO ensure that no resources are being leaked + } + + // enqueue as much data as possible into the current staging area. + // enqueue will return true if all data has been enqueued, false otherwise. + // start, len and cuda_dev_ptr will be updated. + bool enqueue(DirectStorageCUDA::File const& file, size_t& read_start, size_t& len, void*& cuda_dst_ptr) + { + if (len == 0) + return false; + + m_enqueued = true; + + size_t memcpy_src_start = m_current_staging_offset; + + static size_t load_cnt = 0; + size_t read_end = read_start + len; + for (size_t src_start = read_start; src_start < read_end; src_start += m_chunk_size) + { + ++load_cnt; + size_t src_end = min(read_end, src_start + m_chunk_size); + size_t src_size = src_end - src_start; + + if (m_current_staging_offset + src_size >= m_total_staging_space) + { + //std::cout << load_cnt << std::endl; + load_cnt = 0; + + size_t processed_len = m_current_staging_offset - memcpy_src_start; + m_staging_memcpies.push_back(MemcpyOp(cuda_dst_ptr, (void*)((char*)m_cuda_scratch_space + memcpy_src_start), processed_len)); + + cuda_dst_ptr = reinterpret_cast(reinterpret_cast(cuda_dst_ptr) + processed_len); + read_start += processed_len; + len -= processed_len; + + flush(false); + + memcpy_src_start = m_current_staging_offset; + + return true; + } + + DSTORAGE_REQUEST request = {}; + request.Options.SourceType = DSTORAGE_REQUEST_SOURCE_FILE; + request.Options.DestinationType = DSTORAGE_REQUEST_DESTINATION_BUFFER; + request.Source.File.Source = static_cast(file.get())->get(); + request.Source.File.Offset = src_start; + request.Source.File.Size = src_size; // filesize + request.UncompressedSize = src_size; // filesize + request.Destination.Buffer.Resource = m_d3d_scratch_space.get(); + request.Destination.Buffer.Offset = m_current_staging_offset; + request.Destination.Buffer.Size = src_size; + + m_d3d_storage_queue->EnqueueRequest(&request); + + m_current_staging_offset += request.Destination.Buffer.Size; + } + + m_staging_memcpies.push_back(MemcpyOp((void*)((char*)cuda_dst_ptr), (void*)((char*)m_cuda_scratch_space + memcpy_src_start), m_current_staging_offset - memcpy_src_start)); + + size_t processed_len = m_current_staging_offset - memcpy_src_start; + cuda_dst_ptr = reinterpret_cast(reinterpret_cast(cuda_dst_ptr) + m_current_staging_offset - memcpy_src_start); + read_start += processed_len; + len -= processed_len; + + return false; + } + + void enqueue(DirectStorageCUDA::File const& file, size_t& read_start, size_t& read_len, InteropBuffer* interop_buffer, size_t interop_buffer_offset) + { + InteropBufferImpl* ibi = static_cast(interop_buffer); + bool flushed; + while (read_len) { + size_t request_size = min(m_chunk_size, read_len); + + DSTORAGE_REQUEST request = {}; + request.Options.SourceType = DSTORAGE_REQUEST_SOURCE_FILE; + request.Options.DestinationType = DSTORAGE_REQUEST_DESTINATION_BUFFER; + request.Source.File.Source = static_cast(file.get())->get(); + request.Source.File.Offset = read_start; + request.Source.File.Size = request_size; // filesize + request.UncompressedSize = request_size; // filesize + request.Destination.Buffer.Resource = ibi->get_d3d_buffer(); + request.Destination.Buffer.Offset = interop_buffer_offset; + request.Destination.Buffer.Size = request_size; + //std::cout << read_start / (1024*1024) << " / " << interop_buffer_offset / (1024 * 1024) << "/" << request_size / (1024 * 1024) << std::endl; + + m_d3d_storage_queue->EnqueueRequest(&request); + + read_len -= request_size; + interop_buffer_offset += request_size; + read_start += request_size; + + m_enqueued = true; + //flush(true); + }; + + } + + + void wait() + { + if (m_enqueued) { + cudaStreamSynchronize(m_cudaStream); + m_enqueued = false; + } + } + + void flush(bool last) + { + m_d3d_storage_queue->EnqueueSignal(m_d3d_fence.get(), waitParams.params.fence.value); + m_d3d_storage_queue->Submit(); + + nvtxRangePop(); + nvtxRangePush("wait"); + cudaWaitExternalSemaphoresAsync(&m_externalSemaphore, &waitParams, 1, m_cudaStream); + nvtxRangePop(); + nvtxRangePush("memcpy"); +#if 1 + for (auto const& op : m_staging_memcpies) { + auto result = cudaMemcpyAsync(op.m_dst, op.m_src, op.m_size, cudaMemcpyDeviceToDevice, m_cudaStream); + } +#endif + nvtxRangePop(); + nvtxRangePush("sync"); + //cudaStreamSynchronize(m_cudaStream); + nvtxRangePop(); + + // increase fence value by 1 for next flush call + waitParams.params.fence.value += 1; + + // reset staging area + m_staging_memcpies.clear(); + m_current_staging_offset = 0; + +#if 1 + if (last) { + DSTORAGE_ERROR_RECORD errorRecord{}; + m_d3d_storage_queue->RetrieveErrorRecord(&errorRecord); + if (FAILED(errorRecord.FirstFailure.HResult)) + { + // + // errorRecord.FailureCount - The number of failed requests in the queue since the last + // RetrieveErrorRecord call. + // errorRecord.FirstFailure - Detailed record about the first failed command in the enqueue order. + // + std::cout << "The DirectStorage request failed! HRESULT=0x" << std::hex << errorRecord.FirstFailure.HResult << std::endl; + } + } +#endif + } + + winrt::com_ptr m_d3d_device = {}; + winrt::com_ptr m_d3d_factory = {}; + winrt::com_ptr m_d3d_storage_queue = {}; + winrt::com_ptr m_d3d_scratch_space = {}; + winrt::com_ptr m_d3d_fence = {}; + + // cuda external memory resources + cudaExternalMemoryHandleType m_externalMemoryHandleType; + cudaExternalMemory_t m_externalMemory; + cudaExternalSemaphore_t m_externalSemaphore; + cudaExternalSemaphoreWaitParams waitParams = {}; + + size_t m_chunk_size; + size_t m_number_of_chunks; + size_t m_total_staging_space; + + cudaStream_t m_cudaStream; + void* m_cuda_scratch_space; + bool m_enqueued = false; // is any data enqueued + + // memcpy + size_t m_current_staging_offset = 0; // current offset in the staging buffer + + // memcpies from the staging buffer to the actual CUDA memory + struct MemcpyOp { + MemcpyOp(void* dst, void* src, size_t size) + : m_dst(dst), m_src(src), m_size(size) {} + void* m_dst; + void* m_src; + size_t m_size; + }; + std::vector m_staging_memcpies; + + }; + + winrt::com_ptr m_d3d_device = {}; + winrt::com_ptr m_d3d_factory = {}; + + size_t m_chunk_size; + size_t m_number_of_chunks; + + std::vector> m_staging_areas; + size_t m_staging_index = 0; +}; + +std::unique_ptr DirectStorageCUDA::create(int scratch_size, int number_of_scratch_spaces) +{ + return std::make_unique(scratch_size, number_of_scratch_spaces); +} + + // copy read_len bytes starting at read_start from the given file to the given cuda ptr +void DirectStorageCUDAImpl::loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr) +{ + bool flushed; + while (read_len) { + flushed = m_staging_areas[m_staging_index]->enqueue(file, read_start, read_len, cuda_dst_ptr); + if (flushed) { + m_staging_index = (m_staging_index + 1) % m_staging_areas.size(); + m_staging_areas[m_staging_index]->wait(); + } + }; +} + +void DirectStorageCUDAImpl::loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, InteropBuffer *interop_buffer, size_t interop_buffer_offset) +{ + if (!interop_buffer) + return; + + m_staging_areas[m_staging_index]->enqueue(file, read_start, read_len, interop_buffer, interop_buffer_offset); +} + + +void DirectStorageCUDAImpl::flush(bool last) +{ + for (auto& sa : m_staging_areas) { + sa->flush(last); + } + if (last) { + for (auto& sa : m_staging_areas) { + sa->wait(); + } + } +} + +DirectStorageCUDAImpl::DirectStorageCUDAImpl(int scratch_size, int number_of_scratch_spaces) + : m_chunk_size(scratch_size) + , m_number_of_chunks(number_of_scratch_spaces) +{ + DSTORAGE_CONFIGURATION direct_storage_config = {}; + direct_storage_config.NumSubmitThreads = 1; + DStorageSetConfiguration(&direct_storage_config); + + winrt::check_hresult(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_12_1, IID_PPV_ARGS(&m_d3d_device))); + winrt::check_hresult(DStorageGetFactory(IID_PPV_ARGS(m_d3d_factory.put()))); + + size_t num_staging_areas = 2; + for (size_t idx = 0; idx < num_staging_areas; ++idx) { + m_staging_areas.emplace_back(std::make_unique(m_d3d_device, m_d3d_factory, m_chunk_size, m_number_of_chunks)); + + } +} + +DirectStorageCUDAImpl::File DirectStorageCUDAImpl::openFile(std::string const& filename) +{ + File file = std::make_unique(); + std::wstring wfilename(filename.begin(), filename.end()); + nvtxRangePush("factory open file"); + HRESULT hr = m_d3d_factory->OpenFile(wfilename.c_str(), IID_PPV_ARGS(static_cast(file.get())->put())); + if (FAILED(hr)) + { + std::wcout << L"The file '" << wfilename << L"' could not be opened. HRESULT=0x" << std::hex << hr << std::endl; + return {}; + } + nvtxRangePop(); + return file; +} + +std::unique_ptr DirectStorageCUDAImpl::create_interop_buffer(size_t size) +{ + return std::make_unique(m_d3d_device, size); +} diff --git a/ggml-cuda/dsc.h b/ggml-cuda/dsc.h new file mode 100644 index 000000000..95a8430d5 --- /dev/null +++ b/ggml-cuda/dsc.h @@ -0,0 +1,56 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include +#include + +struct DirectStorageCUDAFileHandle { + virtual ~DirectStorageCUDAFileHandle() {}; +}; + +class InteropBuffer { +public: + virtual ~InteropBuffer() = 0; + virtual void* get_device_ptr() const = 0; + virtual void* get_host_ptr() const = 0; +}; + +class DirectStorageCUDA +{ +public: + virtual ~DirectStorageCUDA(); + + using File = std::unique_ptr; + + virtual std::unique_ptr create_interop_buffer(size_t size) = 0; + + virtual File openFile(std::string const& filename) = 0; + virtual void loadFile(File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr) = 0; + virtual void loadFile(File const& file, size_t read_start, size_t read_len, InteropBuffer *interop_buffer, size_t interop_buffer_offset) = 0; + virtual void flush(bool last = false) = 0; + + static std::unique_ptr create(int scratch_size, int number_of_scratch_spaces); +}; +