From 32dd2ef133e8a680653fd33c4fac7741c9a352fc Mon Sep 17 00:00:00 2001
From: Markus Tavenrath
Date: Wed, 12 Jun 2024 10:53:25 +0200
Subject: [PATCH] Implement non-mapped async IO for CUDA on Windows. On a fast
 Gen5 NVMe drive this change improves model load time by >3x, while it should
 be the same (or slightly faster) on any other drive.

---
 llama.cpp | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 8b675ea99..71f5b53ba 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1274,6 +1274,121 @@ struct no_init {
 };
 
 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %lu", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+
+        GGML_ASSERT(ret);
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN);
+        static_assert(SEEK_CUR == FILE_CURRENT);
+        static_assert(SEEK_END == FILE_END);
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+
+        GGML_ASSERT(ret);
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char *>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<const char *>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
@@ -1347,6 +1462,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 
 using llama_files = std::vector<std::unique_ptr<llama_file>>;
@@ -3713,6 +3829,23 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
+#if defined(GGML_USE_CUDA)
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void *> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t backend = ggml_backend_cuda_init(0); // TODO how to get the CUDA device / backend here?
+
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+        for (size_t idx = 0; idx < num_buffers; ++idx) {
+            host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+            host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+            events.emplace_back(ggml_backend_event_new(backend));
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3768,6 +3901,25 @@ struct llama_model_loader {
                         }));
                     }
                 } else {
+#if defined(GGML_USE_CUDA)
+                    file->seek(weight->offs, SEEK_SET);
+
+                    size_t bytes_read = 0;
+
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min(buffer_size, n_size - bytes_read);
+
+                        // reuse this staging buffer only once its previous async upload has completed
+                        ggml_backend_event_synchronize(events[buffer_idx]);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx]);
+
+                        bytes_read += read_iteration;
+                        ++buffer_idx;
+                        buffer_idx %= num_buffers;
+                    }
+#else
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(read_buf.data(), n_size);
@@ -3775,12 +3927,24 @@ struct llama_model_loader {
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                     }
+#endif
                 }
             }
 
             size_done += n_size;
         }
 
+#if defined(GGML_USE_CUDA)
+        // wait for any outstanding async uploads, then free the staging buffers, events and backend
+        for (size_t idx = 0; idx < num_buffers; ++idx) {
+            ggml_backend_event_synchronize(events[idx]);
+            ggml_backend_event_free(events[idx]);
+            ggml_backend_buffer_free(host_buffers[idx]);
+        }
+        ggml_backend_free(backend);
+#endif
+
+        // check validation results
         bool validation_failed = false;
        for (auto & future : validation_result) {
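
Reviewer note: the async load loop above overlaps disk reads with host-to-device copies by cycling through a small ring of staging buffers, waiting on a per-buffer event before reuse. Below is a minimal, standalone sketch of the same round-robin pattern written against the raw CUDA runtime API instead of ggml-backend; all names here (load_file_to_device, NUM_BUFFERS, BUFFER_SIZE, CUDA_CHECK) are illustrative and not part of the patch.

#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// Copy `size` bytes from `file` to the device allocation `dst`, overlapping
// disk reads into pinned staging buffers with async host-to-device copies.
static void load_file_to_device(FILE * file, char * dst, size_t size) {
    constexpr int    NUM_BUFFERS = 4;
    constexpr size_t BUFFER_SIZE = 1 * 1024 * 1024; // 1MB, as in the patch

    void *       host_ptrs[NUM_BUFFERS];
    cudaEvent_t  events[NUM_BUFFERS];
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    for (int i = 0; i < NUM_BUFFERS; ++i) {
        CUDA_CHECK(cudaMallocHost(&host_ptrs[i], BUFFER_SIZE)); // pinned staging memory
        CUDA_CHECK(cudaEventCreate(&events[i]));
        CUDA_CHECK(cudaEventRecord(events[i], stream)); // mark buffer i as initially free
    }

    size_t offset = 0;
    int    idx    = 0;
    while (offset < size) {
        size_t chunk = std::min(BUFFER_SIZE, size - offset);
        // reuse this staging buffer only once its previous upload has finished
        CUDA_CHECK(cudaEventSynchronize(events[idx]));
        if (fread(host_ptrs[idx], 1, chunk, file) != chunk) {
            fprintf(stderr, "short read\n");
            exit(1);
        }
        // enqueue the upload and immediately go read the next chunk from disk
        CUDA_CHECK(cudaMemcpyAsync(dst + offset, host_ptrs[idx], chunk,
                                   cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaEventRecord(events[idx], stream));
        offset += chunk;
        idx = (idx + 1) % NUM_BUFFERS;
    }

    // drain outstanding uploads, then release resources
    for (int i = 0; i < NUM_BUFFERS; ++i) {
        CUDA_CHECK(cudaEventSynchronize(events[i]));
        CUDA_CHECK(cudaEventDestroy(events[i]));
        CUDA_CHECK(cudaFreeHost(host_ptrs[i]));
    }
    CUDA_CHECK(cudaStreamDestroy(stream));
}

While one chunk is being uploaded on the stream, fread is already filling the next staging buffer; the synchronize only blocks when the ring wraps around to a buffer whose upload has not yet finished. This is exactly the ggml_backend_event_synchronize / ggml_backend_event_record pairing used in the patch.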
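On the choice of constants: with num_buffers = 4 and buffer_size = 1 MB, the read of one chunk can overlap with up to three uploads still in flight, at a total cost of only num_buffers * buffer_size = 4 MB of staging memory. These defaults should suit a single fast NVMe drive; RAID configurations may benefit from more or larger buffers, and the TODO in the patch notes that the CUDA device is still hard-coded to 0.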