add missing DirectStorageCUDA files

This commit is contained in:
Markus Tavenrath 2024-04-23 09:37:28 +02:00
parent f1571c96fc
commit b11224c5e1
2 changed files with 708 additions and 0 deletions

652
ggml-cuda/dsc.cpp Normal file
View file

@ -0,0 +1,652 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "dsc.h"
#include <dstorage.h>
#include <dxgi1_4.h>
#include <iostream>
#include <filesystem>
#include <fstream>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <winrt/base.h>
#include <windows.h>
#include <sstream>
#include <cuda.h>
#include <cuda_d3d11_interop.h>
#include <Aclapi.h> // WindowsSecurityAttributes
#include <queue>
#include <condition_variable>
#include <mutex>
#include <nvtx3/nvToolsExt.h>
class WindowsSecurityAttributes
{
protected:
SECURITY_ATTRIBUTES m_winSecurityAttributes = {};
SECURITY_DESCRIPTOR m_securityDescriptor = {};
PSID pSID = 0;
PACL pACL = 0;
public:
WindowsSecurityAttributes()
{
InitializeSecurityDescriptor(&m_securityDescriptor, SECURITY_DESCRIPTOR_REVISION);
SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY;
AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, &pSID);
EXPLICIT_ACCESS explicitAccess = {};
explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL;
explicitAccess.grfAccessMode = SET_ACCESS;
explicitAccess.grfInheritance = INHERIT_ONLY;
explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID;
explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP;
explicitAccess.Trustee.ptstrName = reinterpret_cast<LPTSTR>(pSID);
SetEntriesInAcl(1, &explicitAccess, NULL, &pACL);
SetSecurityDescriptorDacl(&m_securityDescriptor, TRUE, pACL, FALSE);
m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes);
m_winSecurityAttributes.lpSecurityDescriptor = &m_securityDescriptor;
m_winSecurityAttributes.bInheritHandle = TRUE;
}
WindowsSecurityAttributes(WindowsSecurityAttributes const& rhs) = delete;
WindowsSecurityAttributes(WindowsSecurityAttributes const&& rhs) = delete;
~WindowsSecurityAttributes() {
if (pSID)
{
FreeSid(pSID);
}
if (pACL)
{
LocalFree(pACL);
}
}
operator SECURITY_ATTRIBUTES const* () const {
return &m_winSecurityAttributes;
}
};
DirectStorageCUDA::~DirectStorageCUDA()
{
}
struct DirectStorageCUDAFileHandleImpl : DirectStorageCUDAFileHandle
{
~DirectStorageCUDAFileHandleImpl() {};
using File = winrt::com_ptr<IDStorageFile>;
File file;
IDStorageFile* get() { return file.get(); }
IDStorageFile** put() { return file.put(); }
};
InteropBuffer::~InteropBuffer()
{
}
class InteropBufferImpl : public InteropBuffer
{
public:
InteropBufferImpl(winrt::com_ptr<ID3D12Device> const& d3d_device, size_t size)
{
// Create the ID3D12Resource buffer which will be used as temporary scratch space for d3d
// since it's not possible to import CUDA memory into DX.
D3D12_HEAP_PROPERTIES bufferHeapProps = {};
bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
D3D12_HEAP_DESC hd = {};
hd.SizeInBytes = size;
hd.Properties = bufferHeapProps;
hd.Flags = D3D12_HEAP_FLAG_SHARED;
hd.Alignment = 0;
d3d_device->CreateHeap(&hd, IID_PPV_ARGS(&m_d3d_heap));
D3D12_RESOURCE_DESC bufferDesc = {};
bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
bufferDesc.Width = size;
bufferDesc.Height = 1;
bufferDesc.DepthOrArraySize = 1;
bufferDesc.MipLevels = 1;
bufferDesc.Format = DXGI_FORMAT_UNKNOWN;
bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
bufferDesc.SampleDesc.Count = 1;
//#define USE_BUFFER
#if defined(USE_BUFFER) //
winrt::check_hresult(d3d_device->CreateCommittedResource(
&bufferHeapProps,
D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
&bufferDesc,
D3D12_RESOURCE_STATE_COMMON,
nullptr,
IID_PPV_ARGS(m_d3d_buffer.put())));
#else
winrt::check_hresult(d3d_device->CreatePlacedResource(
m_d3d_heap.get(),
0,
&bufferDesc,
D3D12_RESOURCE_STATE_COMMON,
nullptr,
IID_PPV_ARGS(m_d3d_buffer.put())));
#endif
#if 0
// debug begin
bufferHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
winrt::check_hresult(d3d_device->CreateCommittedResource(
&bufferHeapProps,
D3D12_HEAP_FLAG_NONE,
&bufferDesc,
D3D12_RESOURCE_STATE_COMMON,
nullptr,
IID_PPV_ARGS(m_host_buffer.put())));
m_host_buffer->Map(0, nullptr, &m_host_ptr);
d3d_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_cmdallocator));
d3d_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_cmdallocator.get(), nullptr, IID_PPV_ARGS(&m_cmdlist));
#endif
D3D12_COMMAND_QUEUE_DESC qd = {};
qd.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
d3d_device->CreateCommandQueue(&qd, IID_PPV_ARGS(&m_cmd_queue));
// debug end
// create a shared handle to require to import the d3d buffer into CUDA
HANDLE sharedHandle;
WindowsSecurityAttributes windowsSecurityAttributes;
LPCWSTR name = NULL;
#if USE_BUFFER
d3d_device->CreateSharedHandle(m_d3d_buffer.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
#else
d3d_device->CreateSharedHandle(m_d3d_heap.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap;
#endif
externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
externalMemoryHandleDesc.size = bufferDesc.Width;
externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
auto result = cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc);
CloseHandle(sharedHandle);
// get pointer to external memory imported form d3d
cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {};
externalMemoryBufferDesc.offset = 0;
externalMemoryBufferDesc.size = externalMemoryHandleDesc.size;
externalMemoryBufferDesc.flags = 0;
result = cudaExternalMemoryGetMappedBuffer(&m_cuda_dev_ptr, m_externalMemory, &externalMemoryBufferDesc);
result = cudaDeviceSynchronize();
auto err = cudaMemset(m_cuda_dev_ptr, 255, 512*1024*1024);
result = cudaDeviceSynchronize();
std::cout << "err: " << err << std::endl;
}
~InteropBufferImpl() {
auto result = cudaDestroyExternalMemory(m_externalMemory);
cudaFree(m_cuda_dev_ptr);
if (result != cudaSuccess) {
std::cout << "cudaDestroyExternalMemory interop buffer: " << result << std::endl;
}
}
void* get_device_ptr() const {
return m_cuda_dev_ptr;
}
ID3D12Resource* get_d3d_buffer() const {
return m_d3d_buffer.get();
}
void* get_host_ptr() const {
#if 0
m_cmdlist->Reset(m_cmdallocator.get(), nullptr);
m_cmdlist->CopyResource(m_host_buffer.get(), m_d3d_buffer.get());
m_cmdlist->Close();
ID3D12CommandList *ptr = m_cmdlist.get();
m_cmd_queue->ExecuteCommandLists(1, &ptr);
Sleep(2);
#endif
return m_host_ptr;
}
private:
winrt::com_ptr<ID3D12CommandQueue> m_cmd_queue = {};
winrt::com_ptr<ID3D12Resource> m_d3d_buffer = {};
winrt::com_ptr<ID3D12Heap> m_d3d_heap = {};
cudaExternalMemory_t m_externalMemory;
void* m_cuda_dev_ptr;
// debug
winrt::com_ptr<ID3D12Resource> m_host_buffer = {};
winrt::com_ptr<ID3D12GraphicsCommandList> m_cmdlist = {};
winrt::com_ptr<ID3D12CommandAllocator> m_cmdallocator = {};
void* m_host_ptr;
};
class DirectStorageCUDAImpl : public DirectStorageCUDA
{
public:
DirectStorageCUDAImpl(int scratch_size, int number_of_scratch_spaces);
virtual ~DirectStorageCUDAImpl() {
flush(true);
std::cout << "~DirectStorageCudaImpl" << std::endl;
}
struct FileInfo {
const std::string& filename;
void* cuda_device_ptr;
size_t offset;
size_t size;
};
virtual std::unique_ptr<InteropBuffer> create_interop_buffer(size_t size);
virtual DirectStorageCUDA::File openFile(std::string const& filename);
virtual void loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr);
virtual void loadFile(File const& file, size_t read_start, size_t read_len, InteropBuffer* interop_buffer, size_t interop_buffer_offset);
virtual void flush(bool last);
private:
class StagingArea
{
public:
StagingArea(winrt::com_ptr<ID3D12Device> d3d_device, winrt::com_ptr<IDStorageFactory> d3d_factory, size_t chunk_size, size_t number_of_chunks)
: m_d3d_device(d3d_device)
, m_d3d_factory(d3d_factory)
, m_chunk_size(chunk_size)
, m_number_of_chunks(number_of_chunks)
, m_total_staging_space(chunk_size * number_of_chunks)
{
// Create a DirectStorage queue which will be used to load data into a
// buffer on the GPU.
DSTORAGE_QUEUE_DESC queueDesc{};
queueDesc.Capacity = DSTORAGE_MAX_QUEUE_CAPACITY;
queueDesc.Priority = DSTORAGE_PRIORITY_NORMAL;
queueDesc.SourceType = DSTORAGE_REQUEST_SOURCE_FILE;
queueDesc.Device = m_d3d_device.get();
winrt::check_hresult(m_d3d_factory->CreateQueue(&queueDesc, IID_PPV_ARGS(m_d3d_storage_queue.put())));
// Create the ID3D12Resource buffer which will be used as temporary scratch space for d3d
// since it's not possible to import CUDA memory into DX.
D3D12_HEAP_PROPERTIES bufferHeapProps = {};
bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
D3D12_RESOURCE_DESC bufferDesc = {};
bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
bufferDesc.Width = m_chunk_size * m_number_of_chunks;
bufferDesc.Height = 1;
bufferDesc.DepthOrArraySize = 1;
bufferDesc.MipLevels = 1;
bufferDesc.Format = DXGI_FORMAT_UNKNOWN;
bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
bufferDesc.SampleDesc.Count = 1;
winrt::check_hresult(m_d3d_device->CreateCommittedResource(
&bufferHeapProps,
D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
&bufferDesc,
D3D12_RESOURCE_STATE_COMMON,
nullptr,
IID_PPV_ARGS(m_d3d_scratch_space.put())));
// create a shared handle to require to import the d3d buffer into CUDA
HANDLE sharedHandle;
WindowsSecurityAttributes windowsSecurityAttributes;
LPCWSTR name = NULL;
m_d3d_device->CreateSharedHandle(m_d3d_scratch_space.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
externalMemoryHandleDesc.size = bufferDesc.Width;
externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
auto result = cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc);
CloseHandle(sharedHandle);
// get pointer to external memory imported form d3d
cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {};
externalMemoryBufferDesc.offset = 0;
externalMemoryBufferDesc.size = externalMemoryHandleDesc.size;
externalMemoryBufferDesc.flags = 0;
result = cudaExternalMemoryGetMappedBuffer(&m_cuda_scratch_space, m_externalMemory, &externalMemoryBufferDesc);
// create d3d fence for synchronization
auto resultDx = m_d3d_device->CreateFence(0, D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(&m_d3d_fence));
// import d3d fence as semaphore into CUDA.
cudaExternalSemaphoreHandleDesc extSemHandleDesc = {};
extSemHandleDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence;
m_d3d_device->CreateSharedHandle(m_d3d_fence.get(), nullptr, GENERIC_ALL, nullptr, &extSemHandleDesc.handle.win32.handle);
result = cudaImportExternalSemaphore(&m_externalSemaphore, &extSemHandleDesc);
cudaStreamCreate(&m_cudaStream);
// intialize fence to wait for in flush
waitParams.params.fence.value = 1;
}
~StagingArea()
{
std::cout << "~StagingArea" << std::endl;
auto result = cudaDestroyExternalMemory(m_externalMemory);
cudaFree(m_cuda_scratch_space);
if (result != cudaSuccess) {
std::cout << "cudaDestroyExternalMemory interop buffer: " << result << std::endl;
}
// TODO ensure that no resources are being leaked
}
// enqueue as much data as possible into the current staging area.
// enqueue will return true if all data has been enqueued, false otherwise.
// start, len and cuda_dev_ptr will be updated.
bool enqueue(DirectStorageCUDA::File const& file, size_t& read_start, size_t& len, void*& cuda_dst_ptr)
{
if (len == 0)
return false;
m_enqueued = true;
size_t memcpy_src_start = m_current_staging_offset;
static size_t load_cnt = 0;
size_t read_end = read_start + len;
for (size_t src_start = read_start; src_start < read_end; src_start += m_chunk_size)
{
++load_cnt;
size_t src_end = min(read_end, src_start + m_chunk_size);
size_t src_size = src_end - src_start;
if (m_current_staging_offset + src_size >= m_total_staging_space)
{
//std::cout << load_cnt << std::endl;
load_cnt = 0;
size_t processed_len = m_current_staging_offset - memcpy_src_start;
m_staging_memcpies.push_back(MemcpyOp(cuda_dst_ptr, (void*)((char*)m_cuda_scratch_space + memcpy_src_start), processed_len));
cuda_dst_ptr = reinterpret_cast<void*>(reinterpret_cast<char*>(cuda_dst_ptr) + processed_len);
read_start += processed_len;
len -= processed_len;
flush(false);
memcpy_src_start = m_current_staging_offset;
return true;
}
DSTORAGE_REQUEST request = {};
request.Options.SourceType = DSTORAGE_REQUEST_SOURCE_FILE;
request.Options.DestinationType = DSTORAGE_REQUEST_DESTINATION_BUFFER;
request.Source.File.Source = static_cast<DirectStorageCUDAFileHandleImpl*>(file.get())->get();
request.Source.File.Offset = src_start;
request.Source.File.Size = src_size; // filesize
request.UncompressedSize = src_size; // filesize
request.Destination.Buffer.Resource = m_d3d_scratch_space.get();
request.Destination.Buffer.Offset = m_current_staging_offset;
request.Destination.Buffer.Size = src_size;
m_d3d_storage_queue->EnqueueRequest(&request);
m_current_staging_offset += request.Destination.Buffer.Size;
}
m_staging_memcpies.push_back(MemcpyOp((void*)((char*)cuda_dst_ptr), (void*)((char*)m_cuda_scratch_space + memcpy_src_start), m_current_staging_offset - memcpy_src_start));
size_t processed_len = m_current_staging_offset - memcpy_src_start;
cuda_dst_ptr = reinterpret_cast<void*>(reinterpret_cast<char*>(cuda_dst_ptr) + m_current_staging_offset - memcpy_src_start);
read_start += processed_len;
len -= processed_len;
return false;
}
void enqueue(DirectStorageCUDA::File const& file, size_t& read_start, size_t& read_len, InteropBuffer* interop_buffer, size_t interop_buffer_offset)
{
InteropBufferImpl* ibi = static_cast<InteropBufferImpl*>(interop_buffer);
bool flushed;
while (read_len) {
size_t request_size = min(m_chunk_size, read_len);
DSTORAGE_REQUEST request = {};
request.Options.SourceType = DSTORAGE_REQUEST_SOURCE_FILE;
request.Options.DestinationType = DSTORAGE_REQUEST_DESTINATION_BUFFER;
request.Source.File.Source = static_cast<DirectStorageCUDAFileHandleImpl*>(file.get())->get();
request.Source.File.Offset = read_start;
request.Source.File.Size = request_size; // filesize
request.UncompressedSize = request_size; // filesize
request.Destination.Buffer.Resource = ibi->get_d3d_buffer();
request.Destination.Buffer.Offset = interop_buffer_offset;
request.Destination.Buffer.Size = request_size;
//std::cout << read_start / (1024*1024) << " / " << interop_buffer_offset / (1024 * 1024) << "/" << request_size / (1024 * 1024) << std::endl;
m_d3d_storage_queue->EnqueueRequest(&request);
read_len -= request_size;
interop_buffer_offset += request_size;
read_start += request_size;
m_enqueued = true;
//flush(true);
};
}
void wait()
{
if (m_enqueued) {
cudaStreamSynchronize(m_cudaStream);
m_enqueued = false;
}
}
void flush(bool last)
{
m_d3d_storage_queue->EnqueueSignal(m_d3d_fence.get(), waitParams.params.fence.value);
m_d3d_storage_queue->Submit();
nvtxRangePop();
nvtxRangePush("wait");
cudaWaitExternalSemaphoresAsync(&m_externalSemaphore, &waitParams, 1, m_cudaStream);
nvtxRangePop();
nvtxRangePush("memcpy");
#if 1
for (auto const& op : m_staging_memcpies) {
auto result = cudaMemcpyAsync(op.m_dst, op.m_src, op.m_size, cudaMemcpyDeviceToDevice, m_cudaStream);
}
#endif
nvtxRangePop();
nvtxRangePush("sync");
//cudaStreamSynchronize(m_cudaStream);
nvtxRangePop();
// increase fence value by 1 for next flush call
waitParams.params.fence.value += 1;
// reset staging area
m_staging_memcpies.clear();
m_current_staging_offset = 0;
#if 1
if (last) {
DSTORAGE_ERROR_RECORD errorRecord{};
m_d3d_storage_queue->RetrieveErrorRecord(&errorRecord);
if (FAILED(errorRecord.FirstFailure.HResult))
{
//
// errorRecord.FailureCount - The number of failed requests in the queue since the last
// RetrieveErrorRecord call.
// errorRecord.FirstFailure - Detailed record about the first failed command in the enqueue order.
//
std::cout << "The DirectStorage request failed! HRESULT=0x" << std::hex << errorRecord.FirstFailure.HResult << std::endl;
}
}
#endif
}
winrt::com_ptr<ID3D12Device> m_d3d_device = {};
winrt::com_ptr<IDStorageFactory> m_d3d_factory = {};
winrt::com_ptr<IDStorageQueue> m_d3d_storage_queue = {};
winrt::com_ptr<ID3D12Resource> m_d3d_scratch_space = {};
winrt::com_ptr<ID3D12Fence> m_d3d_fence = {};
// cuda external memory resources
cudaExternalMemoryHandleType m_externalMemoryHandleType;
cudaExternalMemory_t m_externalMemory;
cudaExternalSemaphore_t m_externalSemaphore;
cudaExternalSemaphoreWaitParams waitParams = {};
size_t m_chunk_size;
size_t m_number_of_chunks;
size_t m_total_staging_space;
cudaStream_t m_cudaStream;
void* m_cuda_scratch_space;
bool m_enqueued = false; // is any data enqueued
// memcpy
size_t m_current_staging_offset = 0; // current offset in the staging buffer
// memcpies from the staging buffer to the actual CUDA memory
struct MemcpyOp {
MemcpyOp(void* dst, void* src, size_t size)
: m_dst(dst), m_src(src), m_size(size) {}
void* m_dst;
void* m_src;
size_t m_size;
};
std::vector<MemcpyOp> m_staging_memcpies;
};
winrt::com_ptr<ID3D12Device> m_d3d_device = {};
winrt::com_ptr<IDStorageFactory> m_d3d_factory = {};
size_t m_chunk_size;
size_t m_number_of_chunks;
std::vector<std::unique_ptr<StagingArea>> m_staging_areas;
size_t m_staging_index = 0;
};
std::unique_ptr<DirectStorageCUDA> DirectStorageCUDA::create(int scratch_size, int number_of_scratch_spaces)
{
return std::make_unique<DirectStorageCUDAImpl>(scratch_size, number_of_scratch_spaces);
}
// copy read_len bytes starting at read_start from the given file to the given cuda ptr
void DirectStorageCUDAImpl::loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr)
{
bool flushed;
while (read_len) {
flushed = m_staging_areas[m_staging_index]->enqueue(file, read_start, read_len, cuda_dst_ptr);
if (flushed) {
m_staging_index = (m_staging_index + 1) % m_staging_areas.size();
m_staging_areas[m_staging_index]->wait();
}
};
}
void DirectStorageCUDAImpl::loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, InteropBuffer *interop_buffer, size_t interop_buffer_offset)
{
if (!interop_buffer)
return;
m_staging_areas[m_staging_index]->enqueue(file, read_start, read_len, interop_buffer, interop_buffer_offset);
}
void DirectStorageCUDAImpl::flush(bool last)
{
for (auto& sa : m_staging_areas) {
sa->flush(last);
}
if (last) {
for (auto& sa : m_staging_areas) {
sa->wait();
}
}
}
DirectStorageCUDAImpl::DirectStorageCUDAImpl(int scratch_size, int number_of_scratch_spaces)
: m_chunk_size(scratch_size)
, m_number_of_chunks(number_of_scratch_spaces)
{
DSTORAGE_CONFIGURATION direct_storage_config = {};
direct_storage_config.NumSubmitThreads = 1;
DStorageSetConfiguration(&direct_storage_config);
winrt::check_hresult(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_12_1, IID_PPV_ARGS(&m_d3d_device)));
winrt::check_hresult(DStorageGetFactory(IID_PPV_ARGS(m_d3d_factory.put())));
size_t num_staging_areas = 2;
for (size_t idx = 0; idx < num_staging_areas; ++idx) {
m_staging_areas.emplace_back(std::make_unique<StagingArea>(m_d3d_device, m_d3d_factory, m_chunk_size, m_number_of_chunks));
}
}
DirectStorageCUDAImpl::File DirectStorageCUDAImpl::openFile(std::string const& filename)
{
File file = std::make_unique<DirectStorageCUDAFileHandleImpl>();
std::wstring wfilename(filename.begin(), filename.end());
nvtxRangePush("factory open file");
HRESULT hr = m_d3d_factory->OpenFile(wfilename.c_str(), IID_PPV_ARGS(static_cast<DirectStorageCUDAFileHandleImpl*>(file.get())->put()));
if (FAILED(hr))
{
std::wcout << L"The file '" << wfilename << L"' could not be opened. HRESULT=0x" << std::hex << hr << std::endl;
return {};
}
nvtxRangePop();
return file;
}
std::unique_ptr<InteropBuffer> DirectStorageCUDAImpl::create_interop_buffer(size_t size)
{
return std::make_unique<InteropBufferImpl>(m_d3d_device, size);
}

56
ggml-cuda/dsc.h Normal file
View file

@ -0,0 +1,56 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <memory>
#include <string>
struct DirectStorageCUDAFileHandle {
virtual ~DirectStorageCUDAFileHandle() {};
};
class InteropBuffer {
public:
virtual ~InteropBuffer() = 0;
virtual void* get_device_ptr() const = 0;
virtual void* get_host_ptr() const = 0;
};
class DirectStorageCUDA
{
public:
virtual ~DirectStorageCUDA();
using File = std::unique_ptr<DirectStorageCUDAFileHandle>;
virtual std::unique_ptr<InteropBuffer> create_interop_buffer(size_t size) = 0;
virtual File openFile(std::string const& filename) = 0;
virtual void loadFile(File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr) = 0;
virtual void loadFile(File const& file, size_t read_start, size_t read_len, InteropBuffer *interop_buffer, size_t interop_buffer_offset) = 0;
virtual void flush(bool last = false) = 0;
static std::unique_ptr<DirectStorageCUDA> create(int scratch_size, int number_of_scratch_spaces);
};