add missing DirectStorageCUDA files
This commit is contained in:
parent
f1571c96fc
commit
b11224c5e1
2 changed files with 708 additions and 0 deletions
652
ggml-cuda/dsc.cpp
Normal file
652
ggml-cuda/dsc.cpp
Normal file
|
@ -0,0 +1,652 @@
|
||||||
|
/*
|
||||||
|
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "dsc.h"
|
||||||
|
#include <dstorage.h>
|
||||||
|
#include <dxgi1_4.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <chrono>
|
||||||
|
#include <iostream>
|
||||||
|
#include <winrt/base.h>
|
||||||
|
#include <windows.h>
|
||||||
|
#include <sstream>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include <cuda_d3d11_interop.h>
|
||||||
|
#include <Aclapi.h> // WindowsSecurityAttributes
|
||||||
|
#include <queue>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
#include <nvtx3/nvToolsExt.h>
|
||||||
|
|
||||||
|
|
||||||
|
class WindowsSecurityAttributes
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
SECURITY_ATTRIBUTES m_winSecurityAttributes = {};
|
||||||
|
SECURITY_DESCRIPTOR m_securityDescriptor = {};
|
||||||
|
PSID pSID = 0;
|
||||||
|
PACL pACL = 0;
|
||||||
|
|
||||||
|
public:
|
||||||
|
WindowsSecurityAttributes()
|
||||||
|
{
|
||||||
|
InitializeSecurityDescriptor(&m_securityDescriptor, SECURITY_DESCRIPTOR_REVISION);
|
||||||
|
|
||||||
|
SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY;
|
||||||
|
AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, &pSID);
|
||||||
|
|
||||||
|
EXPLICIT_ACCESS explicitAccess = {};
|
||||||
|
explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL;
|
||||||
|
explicitAccess.grfAccessMode = SET_ACCESS;
|
||||||
|
explicitAccess.grfInheritance = INHERIT_ONLY;
|
||||||
|
explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID;
|
||||||
|
explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP;
|
||||||
|
explicitAccess.Trustee.ptstrName = reinterpret_cast<LPTSTR>(pSID);
|
||||||
|
|
||||||
|
SetEntriesInAcl(1, &explicitAccess, NULL, &pACL);
|
||||||
|
SetSecurityDescriptorDacl(&m_securityDescriptor, TRUE, pACL, FALSE);
|
||||||
|
|
||||||
|
m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes);
|
||||||
|
m_winSecurityAttributes.lpSecurityDescriptor = &m_securityDescriptor;
|
||||||
|
m_winSecurityAttributes.bInheritHandle = TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
WindowsSecurityAttributes(WindowsSecurityAttributes const& rhs) = delete;
|
||||||
|
WindowsSecurityAttributes(WindowsSecurityAttributes const&& rhs) = delete;
|
||||||
|
|
||||||
|
~WindowsSecurityAttributes() {
|
||||||
|
if (pSID)
|
||||||
|
{
|
||||||
|
FreeSid(pSID);
|
||||||
|
}
|
||||||
|
if (pACL)
|
||||||
|
{
|
||||||
|
LocalFree(pACL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
operator SECURITY_ATTRIBUTES const* () const {
|
||||||
|
return &m_winSecurityAttributes;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
DirectStorageCUDA::~DirectStorageCUDA()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DirectStorageCUDAFileHandleImpl : DirectStorageCUDAFileHandle
|
||||||
|
{
|
||||||
|
~DirectStorageCUDAFileHandleImpl() {};
|
||||||
|
|
||||||
|
using File = winrt::com_ptr<IDStorageFile>;
|
||||||
|
File file;
|
||||||
|
|
||||||
|
IDStorageFile* get() { return file.get(); }
|
||||||
|
IDStorageFile** put() { return file.put(); }
|
||||||
|
};
|
||||||
|
|
||||||
|
InteropBuffer::~InteropBuffer()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
class InteropBufferImpl : public InteropBuffer
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
InteropBufferImpl(winrt::com_ptr<ID3D12Device> const& d3d_device, size_t size)
|
||||||
|
{
|
||||||
|
|
||||||
|
// Create the ID3D12Resource buffer which will be used as temporary scratch space for d3d
|
||||||
|
// since it's not possible to import CUDA memory into DX.
|
||||||
|
D3D12_HEAP_PROPERTIES bufferHeapProps = {};
|
||||||
|
bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
|
||||||
|
|
||||||
|
|
||||||
|
D3D12_HEAP_DESC hd = {};
|
||||||
|
hd.SizeInBytes = size;
|
||||||
|
hd.Properties = bufferHeapProps;
|
||||||
|
hd.Flags = D3D12_HEAP_FLAG_SHARED;
|
||||||
|
hd.Alignment = 0;
|
||||||
|
d3d_device->CreateHeap(&hd, IID_PPV_ARGS(&m_d3d_heap));
|
||||||
|
|
||||||
|
|
||||||
|
D3D12_RESOURCE_DESC bufferDesc = {};
|
||||||
|
bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
|
||||||
|
bufferDesc.Width = size;
|
||||||
|
bufferDesc.Height = 1;
|
||||||
|
bufferDesc.DepthOrArraySize = 1;
|
||||||
|
bufferDesc.MipLevels = 1;
|
||||||
|
bufferDesc.Format = DXGI_FORMAT_UNKNOWN;
|
||||||
|
bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
|
||||||
|
bufferDesc.SampleDesc.Count = 1;
|
||||||
|
|
||||||
|
//#define USE_BUFFER
|
||||||
|
#if defined(USE_BUFFER) //
|
||||||
|
winrt::check_hresult(d3d_device->CreateCommittedResource(
|
||||||
|
&bufferHeapProps,
|
||||||
|
D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
|
||||||
|
&bufferDesc,
|
||||||
|
D3D12_RESOURCE_STATE_COMMON,
|
||||||
|
nullptr,
|
||||||
|
IID_PPV_ARGS(m_d3d_buffer.put())));
|
||||||
|
#else
|
||||||
|
winrt::check_hresult(d3d_device->CreatePlacedResource(
|
||||||
|
m_d3d_heap.get(),
|
||||||
|
0,
|
||||||
|
&bufferDesc,
|
||||||
|
D3D12_RESOURCE_STATE_COMMON,
|
||||||
|
nullptr,
|
||||||
|
IID_PPV_ARGS(m_d3d_buffer.put())));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// debug begin
|
||||||
|
bufferHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
|
||||||
|
winrt::check_hresult(d3d_device->CreateCommittedResource(
|
||||||
|
&bufferHeapProps,
|
||||||
|
D3D12_HEAP_FLAG_NONE,
|
||||||
|
&bufferDesc,
|
||||||
|
D3D12_RESOURCE_STATE_COMMON,
|
||||||
|
nullptr,
|
||||||
|
IID_PPV_ARGS(m_host_buffer.put())));
|
||||||
|
|
||||||
|
m_host_buffer->Map(0, nullptr, &m_host_ptr);
|
||||||
|
d3d_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_cmdallocator));
|
||||||
|
d3d_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_cmdallocator.get(), nullptr, IID_PPV_ARGS(&m_cmdlist));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
D3D12_COMMAND_QUEUE_DESC qd = {};
|
||||||
|
qd.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
|
||||||
|
d3d_device->CreateCommandQueue(&qd, IID_PPV_ARGS(&m_cmd_queue));
|
||||||
|
// debug end
|
||||||
|
|
||||||
|
// create a shared handle to require to import the d3d buffer into CUDA
|
||||||
|
HANDLE sharedHandle;
|
||||||
|
WindowsSecurityAttributes windowsSecurityAttributes;
|
||||||
|
LPCWSTR name = NULL;
|
||||||
|
#if USE_BUFFER
|
||||||
|
d3d_device->CreateSharedHandle(m_d3d_buffer.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
|
||||||
|
|
||||||
|
cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
|
||||||
|
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
|
||||||
|
#else
|
||||||
|
d3d_device->CreateSharedHandle(m_d3d_heap.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
|
||||||
|
|
||||||
|
cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
|
||||||
|
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap;
|
||||||
|
#endif
|
||||||
|
externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
|
||||||
|
externalMemoryHandleDesc.size = bufferDesc.Width;
|
||||||
|
externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
|
||||||
|
auto result = cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc);
|
||||||
|
|
||||||
|
CloseHandle(sharedHandle);
|
||||||
|
|
||||||
|
// get pointer to external memory imported form d3d
|
||||||
|
cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {};
|
||||||
|
externalMemoryBufferDesc.offset = 0;
|
||||||
|
externalMemoryBufferDesc.size = externalMemoryHandleDesc.size;
|
||||||
|
externalMemoryBufferDesc.flags = 0;
|
||||||
|
|
||||||
|
result = cudaExternalMemoryGetMappedBuffer(&m_cuda_dev_ptr, m_externalMemory, &externalMemoryBufferDesc);
|
||||||
|
result = cudaDeviceSynchronize();
|
||||||
|
|
||||||
|
auto err = cudaMemset(m_cuda_dev_ptr, 255, 512*1024*1024);
|
||||||
|
result = cudaDeviceSynchronize();
|
||||||
|
std::cout << "err: " << err << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
~InteropBufferImpl() {
|
||||||
|
auto result = cudaDestroyExternalMemory(m_externalMemory);
|
||||||
|
cudaFree(m_cuda_dev_ptr);
|
||||||
|
if (result != cudaSuccess) {
|
||||||
|
std::cout << "cudaDestroyExternalMemory interop buffer: " << result << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* get_device_ptr() const {
|
||||||
|
return m_cuda_dev_ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
ID3D12Resource* get_d3d_buffer() const {
|
||||||
|
return m_d3d_buffer.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
void* get_host_ptr() const {
|
||||||
|
#if 0
|
||||||
|
m_cmdlist->Reset(m_cmdallocator.get(), nullptr);
|
||||||
|
m_cmdlist->CopyResource(m_host_buffer.get(), m_d3d_buffer.get());
|
||||||
|
m_cmdlist->Close();
|
||||||
|
|
||||||
|
ID3D12CommandList *ptr = m_cmdlist.get();
|
||||||
|
m_cmd_queue->ExecuteCommandLists(1, &ptr);
|
||||||
|
Sleep(2);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return m_host_ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
winrt::com_ptr<ID3D12CommandQueue> m_cmd_queue = {};
|
||||||
|
winrt::com_ptr<ID3D12Resource> m_d3d_buffer = {};
|
||||||
|
winrt::com_ptr<ID3D12Heap> m_d3d_heap = {};
|
||||||
|
|
||||||
|
cudaExternalMemory_t m_externalMemory;
|
||||||
|
void* m_cuda_dev_ptr;
|
||||||
|
|
||||||
|
// debug
|
||||||
|
winrt::com_ptr<ID3D12Resource> m_host_buffer = {};
|
||||||
|
winrt::com_ptr<ID3D12GraphicsCommandList> m_cmdlist = {};
|
||||||
|
winrt::com_ptr<ID3D12CommandAllocator> m_cmdallocator = {};
|
||||||
|
void* m_host_ptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
class DirectStorageCUDAImpl : public DirectStorageCUDA
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
DirectStorageCUDAImpl(int scratch_size, int number_of_scratch_spaces);
|
||||||
|
|
||||||
|
virtual ~DirectStorageCUDAImpl() {
|
||||||
|
flush(true);
|
||||||
|
std::cout << "~DirectStorageCudaImpl" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FileInfo {
|
||||||
|
const std::string& filename;
|
||||||
|
void* cuda_device_ptr;
|
||||||
|
size_t offset;
|
||||||
|
size_t size;
|
||||||
|
};
|
||||||
|
|
||||||
|
virtual std::unique_ptr<InteropBuffer> create_interop_buffer(size_t size);
|
||||||
|
virtual DirectStorageCUDA::File openFile(std::string const& filename);
|
||||||
|
virtual void loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr);
|
||||||
|
virtual void loadFile(File const& file, size_t read_start, size_t read_len, InteropBuffer* interop_buffer, size_t interop_buffer_offset);
|
||||||
|
virtual void flush(bool last);
|
||||||
|
private:
|
||||||
|
class StagingArea
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
StagingArea(winrt::com_ptr<ID3D12Device> d3d_device, winrt::com_ptr<IDStorageFactory> d3d_factory, size_t chunk_size, size_t number_of_chunks)
|
||||||
|
: m_d3d_device(d3d_device)
|
||||||
|
, m_d3d_factory(d3d_factory)
|
||||||
|
, m_chunk_size(chunk_size)
|
||||||
|
, m_number_of_chunks(number_of_chunks)
|
||||||
|
, m_total_staging_space(chunk_size * number_of_chunks)
|
||||||
|
{
|
||||||
|
// Create a DirectStorage queue which will be used to load data into a
|
||||||
|
// buffer on the GPU.
|
||||||
|
DSTORAGE_QUEUE_DESC queueDesc{};
|
||||||
|
queueDesc.Capacity = DSTORAGE_MAX_QUEUE_CAPACITY;
|
||||||
|
queueDesc.Priority = DSTORAGE_PRIORITY_NORMAL;
|
||||||
|
queueDesc.SourceType = DSTORAGE_REQUEST_SOURCE_FILE;
|
||||||
|
queueDesc.Device = m_d3d_device.get();
|
||||||
|
|
||||||
|
winrt::check_hresult(m_d3d_factory->CreateQueue(&queueDesc, IID_PPV_ARGS(m_d3d_storage_queue.put())));
|
||||||
|
|
||||||
|
// Create the ID3D12Resource buffer which will be used as temporary scratch space for d3d
|
||||||
|
// since it's not possible to import CUDA memory into DX.
|
||||||
|
D3D12_HEAP_PROPERTIES bufferHeapProps = {};
|
||||||
|
bufferHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
|
||||||
|
|
||||||
|
D3D12_RESOURCE_DESC bufferDesc = {};
|
||||||
|
bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
|
||||||
|
bufferDesc.Width = m_chunk_size * m_number_of_chunks;
|
||||||
|
bufferDesc.Height = 1;
|
||||||
|
bufferDesc.DepthOrArraySize = 1;
|
||||||
|
bufferDesc.MipLevels = 1;
|
||||||
|
bufferDesc.Format = DXGI_FORMAT_UNKNOWN;
|
||||||
|
bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
|
||||||
|
bufferDesc.SampleDesc.Count = 1;
|
||||||
|
|
||||||
|
winrt::check_hresult(m_d3d_device->CreateCommittedResource(
|
||||||
|
&bufferHeapProps,
|
||||||
|
D3D12_HEAP_FLAG_NONE | D3D12_HEAP_FLAG_SHARED,
|
||||||
|
&bufferDesc,
|
||||||
|
D3D12_RESOURCE_STATE_COMMON,
|
||||||
|
nullptr,
|
||||||
|
IID_PPV_ARGS(m_d3d_scratch_space.put())));
|
||||||
|
|
||||||
|
|
||||||
|
// create a shared handle to require to import the d3d buffer into CUDA
|
||||||
|
HANDLE sharedHandle;
|
||||||
|
WindowsSecurityAttributes windowsSecurityAttributes;
|
||||||
|
LPCWSTR name = NULL;
|
||||||
|
m_d3d_device->CreateSharedHandle(m_d3d_scratch_space.get(), windowsSecurityAttributes, GENERIC_ALL, name, &sharedHandle);
|
||||||
|
|
||||||
|
cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
|
||||||
|
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Resource;
|
||||||
|
externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
|
||||||
|
externalMemoryHandleDesc.size = bufferDesc.Width;
|
||||||
|
externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated;
|
||||||
|
auto result = cudaImportExternalMemory(&m_externalMemory, &externalMemoryHandleDesc);
|
||||||
|
|
||||||
|
CloseHandle(sharedHandle);
|
||||||
|
|
||||||
|
// get pointer to external memory imported form d3d
|
||||||
|
cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {};
|
||||||
|
externalMemoryBufferDesc.offset = 0;
|
||||||
|
externalMemoryBufferDesc.size = externalMemoryHandleDesc.size;
|
||||||
|
externalMemoryBufferDesc.flags = 0;
|
||||||
|
|
||||||
|
result = cudaExternalMemoryGetMappedBuffer(&m_cuda_scratch_space, m_externalMemory, &externalMemoryBufferDesc);
|
||||||
|
|
||||||
|
// create d3d fence for synchronization
|
||||||
|
auto resultDx = m_d3d_device->CreateFence(0, D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(&m_d3d_fence));
|
||||||
|
|
||||||
|
// import d3d fence as semaphore into CUDA.
|
||||||
|
cudaExternalSemaphoreHandleDesc extSemHandleDesc = {};
|
||||||
|
extSemHandleDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence;
|
||||||
|
m_d3d_device->CreateSharedHandle(m_d3d_fence.get(), nullptr, GENERIC_ALL, nullptr, &extSemHandleDesc.handle.win32.handle);
|
||||||
|
result = cudaImportExternalSemaphore(&m_externalSemaphore, &extSemHandleDesc);
|
||||||
|
|
||||||
|
cudaStreamCreate(&m_cudaStream);
|
||||||
|
|
||||||
|
// intialize fence to wait for in flush
|
||||||
|
waitParams.params.fence.value = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
~StagingArea()
|
||||||
|
{
|
||||||
|
std::cout << "~StagingArea" << std::endl;
|
||||||
|
auto result = cudaDestroyExternalMemory(m_externalMemory);
|
||||||
|
cudaFree(m_cuda_scratch_space);
|
||||||
|
if (result != cudaSuccess) {
|
||||||
|
std::cout << "cudaDestroyExternalMemory interop buffer: " << result << std::endl;
|
||||||
|
}
|
||||||
|
// TODO ensure that no resources are being leaked
|
||||||
|
}
|
||||||
|
|
||||||
|
// enqueue as much data as possible into the current staging area.
|
||||||
|
// enqueue will return true if all data has been enqueued, false otherwise.
|
||||||
|
// start, len and cuda_dev_ptr will be updated.
|
||||||
|
bool enqueue(DirectStorageCUDA::File const& file, size_t& read_start, size_t& len, void*& cuda_dst_ptr)
|
||||||
|
{
|
||||||
|
if (len == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
m_enqueued = true;
|
||||||
|
|
||||||
|
size_t memcpy_src_start = m_current_staging_offset;
|
||||||
|
|
||||||
|
static size_t load_cnt = 0;
|
||||||
|
size_t read_end = read_start + len;
|
||||||
|
for (size_t src_start = read_start; src_start < read_end; src_start += m_chunk_size)
|
||||||
|
{
|
||||||
|
++load_cnt;
|
||||||
|
size_t src_end = min(read_end, src_start + m_chunk_size);
|
||||||
|
size_t src_size = src_end - src_start;
|
||||||
|
|
||||||
|
if (m_current_staging_offset + src_size >= m_total_staging_space)
|
||||||
|
{
|
||||||
|
//std::cout << load_cnt << std::endl;
|
||||||
|
load_cnt = 0;
|
||||||
|
|
||||||
|
size_t processed_len = m_current_staging_offset - memcpy_src_start;
|
||||||
|
m_staging_memcpies.push_back(MemcpyOp(cuda_dst_ptr, (void*)((char*)m_cuda_scratch_space + memcpy_src_start), processed_len));
|
||||||
|
|
||||||
|
cuda_dst_ptr = reinterpret_cast<void*>(reinterpret_cast<char*>(cuda_dst_ptr) + processed_len);
|
||||||
|
read_start += processed_len;
|
||||||
|
len -= processed_len;
|
||||||
|
|
||||||
|
flush(false);
|
||||||
|
|
||||||
|
memcpy_src_start = m_current_staging_offset;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
DSTORAGE_REQUEST request = {};
|
||||||
|
request.Options.SourceType = DSTORAGE_REQUEST_SOURCE_FILE;
|
||||||
|
request.Options.DestinationType = DSTORAGE_REQUEST_DESTINATION_BUFFER;
|
||||||
|
request.Source.File.Source = static_cast<DirectStorageCUDAFileHandleImpl*>(file.get())->get();
|
||||||
|
request.Source.File.Offset = src_start;
|
||||||
|
request.Source.File.Size = src_size; // filesize
|
||||||
|
request.UncompressedSize = src_size; // filesize
|
||||||
|
request.Destination.Buffer.Resource = m_d3d_scratch_space.get();
|
||||||
|
request.Destination.Buffer.Offset = m_current_staging_offset;
|
||||||
|
request.Destination.Buffer.Size = src_size;
|
||||||
|
|
||||||
|
m_d3d_storage_queue->EnqueueRequest(&request);
|
||||||
|
|
||||||
|
m_current_staging_offset += request.Destination.Buffer.Size;
|
||||||
|
}
|
||||||
|
|
||||||
|
m_staging_memcpies.push_back(MemcpyOp((void*)((char*)cuda_dst_ptr), (void*)((char*)m_cuda_scratch_space + memcpy_src_start), m_current_staging_offset - memcpy_src_start));
|
||||||
|
|
||||||
|
size_t processed_len = m_current_staging_offset - memcpy_src_start;
|
||||||
|
cuda_dst_ptr = reinterpret_cast<void*>(reinterpret_cast<char*>(cuda_dst_ptr) + m_current_staging_offset - memcpy_src_start);
|
||||||
|
read_start += processed_len;
|
||||||
|
len -= processed_len;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void enqueue(DirectStorageCUDA::File const& file, size_t& read_start, size_t& read_len, InteropBuffer* interop_buffer, size_t interop_buffer_offset)
|
||||||
|
{
|
||||||
|
InteropBufferImpl* ibi = static_cast<InteropBufferImpl*>(interop_buffer);
|
||||||
|
bool flushed;
|
||||||
|
while (read_len) {
|
||||||
|
size_t request_size = min(m_chunk_size, read_len);
|
||||||
|
|
||||||
|
DSTORAGE_REQUEST request = {};
|
||||||
|
request.Options.SourceType = DSTORAGE_REQUEST_SOURCE_FILE;
|
||||||
|
request.Options.DestinationType = DSTORAGE_REQUEST_DESTINATION_BUFFER;
|
||||||
|
request.Source.File.Source = static_cast<DirectStorageCUDAFileHandleImpl*>(file.get())->get();
|
||||||
|
request.Source.File.Offset = read_start;
|
||||||
|
request.Source.File.Size = request_size; // filesize
|
||||||
|
request.UncompressedSize = request_size; // filesize
|
||||||
|
request.Destination.Buffer.Resource = ibi->get_d3d_buffer();
|
||||||
|
request.Destination.Buffer.Offset = interop_buffer_offset;
|
||||||
|
request.Destination.Buffer.Size = request_size;
|
||||||
|
//std::cout << read_start / (1024*1024) << " / " << interop_buffer_offset / (1024 * 1024) << "/" << request_size / (1024 * 1024) << std::endl;
|
||||||
|
|
||||||
|
m_d3d_storage_queue->EnqueueRequest(&request);
|
||||||
|
|
||||||
|
read_len -= request_size;
|
||||||
|
interop_buffer_offset += request_size;
|
||||||
|
read_start += request_size;
|
||||||
|
|
||||||
|
m_enqueued = true;
|
||||||
|
//flush(true);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void wait()
|
||||||
|
{
|
||||||
|
if (m_enqueued) {
|
||||||
|
cudaStreamSynchronize(m_cudaStream);
|
||||||
|
m_enqueued = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void flush(bool last)
|
||||||
|
{
|
||||||
|
m_d3d_storage_queue->EnqueueSignal(m_d3d_fence.get(), waitParams.params.fence.value);
|
||||||
|
m_d3d_storage_queue->Submit();
|
||||||
|
|
||||||
|
nvtxRangePop();
|
||||||
|
nvtxRangePush("wait");
|
||||||
|
cudaWaitExternalSemaphoresAsync(&m_externalSemaphore, &waitParams, 1, m_cudaStream);
|
||||||
|
nvtxRangePop();
|
||||||
|
nvtxRangePush("memcpy");
|
||||||
|
#if 1
|
||||||
|
for (auto const& op : m_staging_memcpies) {
|
||||||
|
auto result = cudaMemcpyAsync(op.m_dst, op.m_src, op.m_size, cudaMemcpyDeviceToDevice, m_cudaStream);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
nvtxRangePop();
|
||||||
|
nvtxRangePush("sync");
|
||||||
|
//cudaStreamSynchronize(m_cudaStream);
|
||||||
|
nvtxRangePop();
|
||||||
|
|
||||||
|
// increase fence value by 1 for next flush call
|
||||||
|
waitParams.params.fence.value += 1;
|
||||||
|
|
||||||
|
// reset staging area
|
||||||
|
m_staging_memcpies.clear();
|
||||||
|
m_current_staging_offset = 0;
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
if (last) {
|
||||||
|
DSTORAGE_ERROR_RECORD errorRecord{};
|
||||||
|
m_d3d_storage_queue->RetrieveErrorRecord(&errorRecord);
|
||||||
|
if (FAILED(errorRecord.FirstFailure.HResult))
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// errorRecord.FailureCount - The number of failed requests in the queue since the last
|
||||||
|
// RetrieveErrorRecord call.
|
||||||
|
// errorRecord.FirstFailure - Detailed record about the first failed command in the enqueue order.
|
||||||
|
//
|
||||||
|
std::cout << "The DirectStorage request failed! HRESULT=0x" << std::hex << errorRecord.FirstFailure.HResult << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
winrt::com_ptr<ID3D12Device> m_d3d_device = {};
|
||||||
|
winrt::com_ptr<IDStorageFactory> m_d3d_factory = {};
|
||||||
|
winrt::com_ptr<IDStorageQueue> m_d3d_storage_queue = {};
|
||||||
|
winrt::com_ptr<ID3D12Resource> m_d3d_scratch_space = {};
|
||||||
|
winrt::com_ptr<ID3D12Fence> m_d3d_fence = {};
|
||||||
|
|
||||||
|
// cuda external memory resources
|
||||||
|
cudaExternalMemoryHandleType m_externalMemoryHandleType;
|
||||||
|
cudaExternalMemory_t m_externalMemory;
|
||||||
|
cudaExternalSemaphore_t m_externalSemaphore;
|
||||||
|
cudaExternalSemaphoreWaitParams waitParams = {};
|
||||||
|
|
||||||
|
size_t m_chunk_size;
|
||||||
|
size_t m_number_of_chunks;
|
||||||
|
size_t m_total_staging_space;
|
||||||
|
|
||||||
|
cudaStream_t m_cudaStream;
|
||||||
|
void* m_cuda_scratch_space;
|
||||||
|
bool m_enqueued = false; // is any data enqueued
|
||||||
|
|
||||||
|
// memcpy
|
||||||
|
size_t m_current_staging_offset = 0; // current offset in the staging buffer
|
||||||
|
|
||||||
|
// memcpies from the staging buffer to the actual CUDA memory
|
||||||
|
struct MemcpyOp {
|
||||||
|
MemcpyOp(void* dst, void* src, size_t size)
|
||||||
|
: m_dst(dst), m_src(src), m_size(size) {}
|
||||||
|
void* m_dst;
|
||||||
|
void* m_src;
|
||||||
|
size_t m_size;
|
||||||
|
};
|
||||||
|
std::vector<MemcpyOp> m_staging_memcpies;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
winrt::com_ptr<ID3D12Device> m_d3d_device = {};
|
||||||
|
winrt::com_ptr<IDStorageFactory> m_d3d_factory = {};
|
||||||
|
|
||||||
|
size_t m_chunk_size;
|
||||||
|
size_t m_number_of_chunks;
|
||||||
|
|
||||||
|
std::vector<std::unique_ptr<StagingArea>> m_staging_areas;
|
||||||
|
size_t m_staging_index = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<DirectStorageCUDA> DirectStorageCUDA::create(int scratch_size, int number_of_scratch_spaces)
|
||||||
|
{
|
||||||
|
return std::make_unique<DirectStorageCUDAImpl>(scratch_size, number_of_scratch_spaces);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy read_len bytes starting at read_start from the given file to the given cuda ptr
|
||||||
|
void DirectStorageCUDAImpl::loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr)
|
||||||
|
{
|
||||||
|
bool flushed;
|
||||||
|
while (read_len) {
|
||||||
|
flushed = m_staging_areas[m_staging_index]->enqueue(file, read_start, read_len, cuda_dst_ptr);
|
||||||
|
if (flushed) {
|
||||||
|
m_staging_index = (m_staging_index + 1) % m_staging_areas.size();
|
||||||
|
m_staging_areas[m_staging_index]->wait();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
void DirectStorageCUDAImpl::loadFile(DirectStorageCUDA::File const& file, size_t read_start, size_t read_len, InteropBuffer *interop_buffer, size_t interop_buffer_offset)
|
||||||
|
{
|
||||||
|
if (!interop_buffer)
|
||||||
|
return;
|
||||||
|
|
||||||
|
m_staging_areas[m_staging_index]->enqueue(file, read_start, read_len, interop_buffer, interop_buffer_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void DirectStorageCUDAImpl::flush(bool last)
|
||||||
|
{
|
||||||
|
for (auto& sa : m_staging_areas) {
|
||||||
|
sa->flush(last);
|
||||||
|
}
|
||||||
|
if (last) {
|
||||||
|
for (auto& sa : m_staging_areas) {
|
||||||
|
sa->wait();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DirectStorageCUDAImpl::DirectStorageCUDAImpl(int scratch_size, int number_of_scratch_spaces)
|
||||||
|
: m_chunk_size(scratch_size)
|
||||||
|
, m_number_of_chunks(number_of_scratch_spaces)
|
||||||
|
{
|
||||||
|
DSTORAGE_CONFIGURATION direct_storage_config = {};
|
||||||
|
direct_storage_config.NumSubmitThreads = 1;
|
||||||
|
DStorageSetConfiguration(&direct_storage_config);
|
||||||
|
|
||||||
|
winrt::check_hresult(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_12_1, IID_PPV_ARGS(&m_d3d_device)));
|
||||||
|
winrt::check_hresult(DStorageGetFactory(IID_PPV_ARGS(m_d3d_factory.put())));
|
||||||
|
|
||||||
|
size_t num_staging_areas = 2;
|
||||||
|
for (size_t idx = 0; idx < num_staging_areas; ++idx) {
|
||||||
|
m_staging_areas.emplace_back(std::make_unique<StagingArea>(m_d3d_device, m_d3d_factory, m_chunk_size, m_number_of_chunks));
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DirectStorageCUDAImpl::File DirectStorageCUDAImpl::openFile(std::string const& filename)
|
||||||
|
{
|
||||||
|
File file = std::make_unique<DirectStorageCUDAFileHandleImpl>();
|
||||||
|
std::wstring wfilename(filename.begin(), filename.end());
|
||||||
|
nvtxRangePush("factory open file");
|
||||||
|
HRESULT hr = m_d3d_factory->OpenFile(wfilename.c_str(), IID_PPV_ARGS(static_cast<DirectStorageCUDAFileHandleImpl*>(file.get())->put()));
|
||||||
|
if (FAILED(hr))
|
||||||
|
{
|
||||||
|
std::wcout << L"The file '" << wfilename << L"' could not be opened. HRESULT=0x" << std::hex << hr << std::endl;
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
nvtxRangePop();
|
||||||
|
return file;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<InteropBuffer> DirectStorageCUDAImpl::create_interop_buffer(size_t size)
|
||||||
|
{
|
||||||
|
return std::make_unique<InteropBufferImpl>(m_d3d_device, size);
|
||||||
|
}
|
56
ggml-cuda/dsc.h
Normal file
56
ggml-cuda/dsc.h
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
/*
|
||||||
|
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
* DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
struct DirectStorageCUDAFileHandle {
|
||||||
|
virtual ~DirectStorageCUDAFileHandle() {};
|
||||||
|
};
|
||||||
|
|
||||||
|
class InteropBuffer {
|
||||||
|
public:
|
||||||
|
virtual ~InteropBuffer() = 0;
|
||||||
|
virtual void* get_device_ptr() const = 0;
|
||||||
|
virtual void* get_host_ptr() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class DirectStorageCUDA
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
virtual ~DirectStorageCUDA();
|
||||||
|
|
||||||
|
using File = std::unique_ptr<DirectStorageCUDAFileHandle>;
|
||||||
|
|
||||||
|
virtual std::unique_ptr<InteropBuffer> create_interop_buffer(size_t size) = 0;
|
||||||
|
|
||||||
|
virtual File openFile(std::string const& filename) = 0;
|
||||||
|
virtual void loadFile(File const& file, size_t read_start, size_t read_len, void* cuda_dst_ptr) = 0;
|
||||||
|
virtual void loadFile(File const& file, size_t read_start, size_t read_len, InteropBuffer *interop_buffer, size_t interop_buffer_offset) = 0;
|
||||||
|
virtual void flush(bool last = false) = 0;
|
||||||
|
|
||||||
|
static std::unique_ptr<DirectStorageCUDA> create(int scratch_size, int number_of_scratch_spaces);
|
||||||
|
};
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue