Implement non-mapped async IO for CUDA on Windows. On a fast Gen5 NVMe drive this change improves model load time by >3x while it should be the same (or slightly faster) on any other drive.

This commit is contained in:
Markus Tavenrath 2024-06-12 10:53:25 +02:00
parent 4bfe50f741
commit 32dd2ef133

161
llama.cpp
View file

@ -1274,6 +1274,122 @@ struct no_init {
};
struct llama_file {
#if defined(_WIN32)
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
HANDLE fp_win32;
size_t size;
private:
std::string GetErrorMessageWin32(DWORD error_code) const {
std::string ret;
LPSTR lpMsgBuf = NULL;
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) {
std::stringstream ss;
ret = format("Win32 error code: %s", error_code);
} else {
ret = lpMsgBuf;
LocalFree(lpMsgBuf);
}
return ret;
}
public:
llama_file(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
// SetFilePointerEx returns the current position when seeking relative 0 bytes
LARGE_INTEGER li;
li.QuadPart = 0;
BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
GGML_ASSERT(ret);
return li.QuadPart;
}
void seek(size_t offset, int whence) const {
// no need to convert SEEK_* to FILE_*. The enums are the same.
// Still, keep static asserts to avoid failures in the future.
static_assert(SEEK_SET == FILE_BEGIN);
static_assert(SEEK_CUR == FILE_CURRENT);
static_assert(SEEK_END == FILE_END);
LARGE_INTEGER li;
li.QuadPart = offset;
BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
GGML_ASSERT(ret);
}
void read_raw(void * ptr, size_t len) const {
// There are conditions under which ReadFile cannot read chunks >64MB.
// Thus split the operation into smaller chunks if len exceeds this limit.
size_t bytes_read = 0;
while (bytes_read < len) {
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
DWORD chunk_read = 0;
BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
if (!result) {
throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError())));
}
if (chunk_read < chunk_size || chunk_read == 0) {
throw std::runtime_error("unexpectedly reached end of file");
}
bytes_read += chunk_read;
} ;
}
uint32_t read_u32() const {
uint32_t val;
read_raw(&val, sizeof(val));
return val;
}
void write_raw(const void * ptr, size_t len) const {
// There are conditions under which WriteFile cannot write chunks >64MB.
// Thus split the operation into smaller chunks if len exceeds this limit.
size_t bytes_written = 0;
while (bytes_written < len) {
size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
DWORD chunk_written = 0;
BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
if (!result) {
throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
}
if (chunk_written < chunk_size || chunk_written == 0) {
throw std::runtime_error("unexpectedly failed to write bytes");
}
bytes_written += chunk_written;
}
}
void write_u32(std::uint32_t val) const {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
#else
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
@ -1347,6 +1463,7 @@ struct llama_file {
std::fclose(fp);
}
}
#endif
};
using llama_files = std::vector<std::unique_ptr<llama_file>>;
@ -3713,6 +3830,23 @@ struct llama_model_loader {
std::vector<no_init<uint8_t>> read_buf;
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
#if defined(GGML_USE_CUDA)
std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<void*> host_ptrs;
std::vector<ggml_backend_event_t> events;
size_t buffer_idx = 0; // buffer to use for async loads
ggml_backend_t backend = ggml_backend_cuda_init(0); // TODO how to get the CUDA device / backend here?
constexpr size_t num_buffers = 4;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
for (size_t idx = 0; idx < num_buffers; ++idx) {
host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
events.emplace_back(ggml_backend_event_new(backend));
}
#endif
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
const auto * weight = get_weight(ggml_get_name(cur));
if (weight == nullptr) {
@ -3768,6 +3902,25 @@ struct llama_model_loader {
}));
}
} else {
#if defined(GGML_USE_CUDA)
file->seek(weight->offs, SEEK_SET);
size_t bytes_read = 0;
while (bytes_read < n_size)
{
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
ggml_backend_event_synchronize(events[buffer_idx]);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
ggml_backend_event_record(events[buffer_idx]);
bytes_read += read_iteration;
++buffer_idx;
buffer_idx %= num_buffers;
}
#else
read_buf.resize(n_size);
file->seek(weight->offs, SEEK_SET);
file->read_raw(read_buf.data(), n_size);
@ -3775,12 +3928,20 @@ struct llama_model_loader {
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
}
#endif
}
}
size_done += n_size;
}
#if defined(GGML_USE_CUDA)
for (auto const& event : events) {
ggml_backend_event_synchronize(event);
}
#endif
// check validation results
bool validation_failed = false;
for (auto & future : validation_result) {