From f1571c96fcd5f11f7d60489d01db99e97932290b Mon Sep 17 00:00:00 2001
From: Markus Tavenrath
Date: Fri, 19 Apr 2024 15:07:32 +0200
Subject: [PATCH] Add backdoor to ggml to use DirectStorage to load tensors.

---
 CMakeLists.txt | 39 ++++++++++++++++++++--------
 ggml-backend.c |  3 ++-
 ggml-cuda.cu   | 70 ++++++++++++++++++++++++++++++++++++++++++++++++--
 llama.cpp      | 51 ++++++++++++++++++++++++++++++++++--
 4 files changed, 148 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f134a153b..6b3b7ea42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,8 @@ option(LLAMA_BLAS "llama: use BLAS"
 option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
+option(LLAMA_CUDA_DIRECT_STORAGE "llama: use DirectStorage to upload tensors" OFF)
+set(LLAMA_DIRECT_STORAGE_DIR "" CACHE PATH "llama: path to DirectStorage directory fetched with nuget. See https://devblogs.microsoft.com/directx/directstorage-api-downloads/" )
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
@@ -152,7 +154,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 # Compile flags
 #
 
-if (LLAMA_SYCL)
+if (LLAMA_SYCL OR LLAMA_CUDA_DIRECT_STORAGE)
     set(CMAKE_CXX_STANDARD 17)
 else()
     set(CMAKE_CXX_STANDARD 11)
@@ -412,6 +414,15 @@ if (LLAMA_CUDA)
         file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
 
+        if (LLAMA_CUDA_DIRECT_STORAGE)
+            file(GLOB GGML_SOURCES_CUDA_C "ggml-cuda/*.cpp")
+            file(GLOB GGML_SOURCES_CUDA_H "ggml-cuda/*.h")
+            list(APPEND GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA_C})
+            list(APPEND GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA_H})
+
+            add_compile_definitions(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        endif()
+
         add_compile_definitions(GGML_USE_CUDA)
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
@@ -1172,15 +1183,15 @@ add_library(ggml OBJECT
             ggml-backend.h
             ggml-quants.c
             ggml-quants.h
-            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )
 
@@ -1198,6 +1209,14 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS ggml_shared LIBRARY)
 endif()
 
+if (LLAMA_CUDA_DIRECT_STORAGE)
+    set_property(TARGET ggml PROPERTY VS_PACKAGE_REFERENCES "Microsoft.Direct3D.DirectStorage_1.2.2")
+
+    target_include_directories(ggml PRIVATE "${LLAMA_DIRECT_STORAGE_DIR}/native/include")
+    target_link_directories(ggml PRIVATE "${LLAMA_DIRECT_STORAGE_DIR}/native/lib/x64")
+    target_link_libraries(ggml PUBLIC "${LLAMA_DIRECT_STORAGE_DIR}/native/lib/x64/dstorage.lib" cuda cudart d3d12)
+endif()
+
 # llama
 
 add_library(llama
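The diffs below thread an extra signal through the existing `ggml_backend_tensor_set()` API: bit 31 of the `size` argument marks a DirectStorage request, in which case `data` points at a small request descriptor instead of tensor bytes, and the out-of-bounds assert in ggml-backend.c is relaxed to mask that bit out. A minimal sketch of that convention follows; the helper and constant names are illustrative, only the bit position comes from the patch, which open-codes the masking at each call site.

```cpp
// Sketch of the in-band flag used by this patch (illustrative names).
#include <cstddef>

constexpr size_t GGML_DS_SIZE_FLAG = size_t(1u) << 31;    // bit 31 of `size` marks a DirectStorage request

static inline bool ggml_ds_is_request(size_t size) {      // true if the caller packed a request descriptor into `data`
    return (size & GGML_DS_SIZE_FLAG) != 0;
}

static inline size_t ggml_ds_payload_size(size_t size) {  // number of bytes to read once the flag is stripped
    return size & ~GGML_DS_SIZE_FLAG;
}
```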
"${LLAMA_DIRECT_STORAGE_DIR}/native/lib/x64/dstorage.lib" cuda cudart d3d12) +endif() + # llama add_library(llama diff --git a/ggml-backend.c b/ggml-backend.c index 402d86ef3..abf29ac09 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -223,7 +223,8 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + //GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(offset + (size & ~(1u << 31)) <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (!size) { return; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d277104d1..8e1207697 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -29,6 +29,7 @@ #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" +#include "ggml-cuda/dsc.h" #include #include @@ -45,6 +46,8 @@ #include #include #include +#include +#include // debug static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -79,6 +82,10 @@ int ggml_cuda_get_device() { return id; } +#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA) +std::unique_ptr dsc; +#endif + static ggml_cuda_device_info ggml_cuda_init() { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: @@ -149,6 +156,10 @@ static ggml_cuda_device_info ggml_cuda_init() { // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); +#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA) + dsc = std::move(DirectStorageCUDA::create(8 * 1024 * 1024, 64)); +#endif + return info; } @@ -418,12 +429,67 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t } } +struct FileInfo { + std::vector handles; + size_t handle_idx = 0; + + DirectStorageCUDA::File& getFile() { + auto& temp = handles[handle_idx]; + ++handle_idx; + handle_idx %= handles.size(); + return temp; + } +}; + +std::map files; + GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); +#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA) + if (size & (1u << 31)) { + size &= ~(1u << 31); + if (data == nullptr) { + dsc->flush(); + return; + } + struct Temp { + const char* filename; + size_t weights_off; + }; + Temp* t = (Temp*)data; + + std::string filename = t->filename; + auto it = files.find(filename); + if (it == files.end()) { + files[filename].handles.push_back(dsc->openFile(filename)); + +#if 0 + // This is a hack to evaluate how fast data can be read from a 2nd disk. 
diff --git a/llama.cpp b/llama.cpp
index fa7c022f2..992337e0c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7,6 +7,9 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#include <chrono>
+#include <iostream>
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -1176,8 +1179,10 @@ struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
+    std::string filename;
 
     llama_file(const char * fname, const char * mode) {
+        filename = fname;
         fp = ggml_fopen(fname, mode);
         if (fp == NULL) {
             throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -3459,7 +3464,9 @@
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
 
-    // Returns false if cancelled by progress_callback
+
+
+    // Returns false if canceled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
             llama_buf_map & bufs_mmap,
@@ -3468,6 +3475,14 @@
             void * progress_callback_user_data) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        struct ggml_tensor* last_tensor = nullptr;
+
+        // debug statistics
+        size_t total_data_read = 0;
+        auto start = std::chrono::high_resolution_clock::now();
+#endif
+
         std::vector<no_init<uint8_t>> read_buf;
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
@@ -3511,16 +3526,39 @@
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(cur->data, ggml_nbytes(cur));
                 } else {
+
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+                    // backdoor to load tensors with DirectStorage
+                    last_tensor = cur;
+                    struct Temp {
+                        const char* filename;
+                        size_t weights_off;
+                    };
+
+                    Temp t;
+                    t.filename = file->filename.c_str();
+                    t.weights_off = weight->offs;
+
+                    ggml_backend_tensor_set(cur, &t, 0, n_size | (1u << 31));
+#else
                     read_buf.resize(ggml_nbytes(cur));
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(read_buf.data(), ggml_nbytes(cur));
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+#endif
+
                 }
             }
 
             size_done += n_size;
         }
 
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        // trigger flush of unread data
+        if (last_tensor)
+            ggml_backend_tensor_set(last_tensor, 0, 0, 1u << 31);
+#endif
+
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
@@ -3541,6 +3579,14 @@
             }
         }
 
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double, std::ratio<1>> delta(end - start);
+        //auto seconds = std::chrono::duration_cast<std::chrono::seconds>(delta);
+        std::cout << "load time: " << delta.count() << std::endl;
+#endif
+
+
         return true;
     }
 };
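On the loader side, llama.cpp packs the source file and byte offset into a local `Temp` struct and passes it through `ggml_backend_tensor_set()` with bit 31 of `size` set; a final call with `data == nullptr` and only the flag bit asks the CUDA backend to flush the queued reads. Note that llama.cpp and ggml-cuda.cu each declare an identical local `Temp`; a shared helper along these lines (hypothetical, not part of the patch) would keep the two layouts in sync:

```cpp
// Hypothetical shared helper for the backdoor protocol. The descriptor layout and
// the flag bit are taken from the patch; the struct and function names are not.
#include <cstddef>

#include "ggml-backend.h"   // ggml_backend_tensor_set()

struct ggml_ds_request {
    const char * filename;    // GGUF file the tensor payload lives in
    size_t       weights_off; // absolute byte offset of the payload in that file
};

// Queue a DirectStorage read for `tensor` instead of copying host memory.
static void ggml_ds_read_tensor(struct ggml_tensor * tensor, const char * filename, size_t offs, size_t n_size) {
    ggml_ds_request req = { filename, offs };
    ggml_backend_tensor_set(tensor, &req, 0, n_size | (size_t(1u) << 31));
}

// data == nullptr with only the flag bit set triggers dsc->flush() in the backend.
static void ggml_ds_flush(struct ggml_tensor * last_tensor) {
    ggml_backend_tensor_set(last_tensor, nullptr, 0, size_t(1u) << 31);
}
```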
@@ -5874,6 +5920,7 @@ static bool llm_load_tensors(
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+    std::cout << "model load time: " << model.t_load_us / 1000.0f << "ms" << std::endl;
 
     return true;
 }
@@ -14213,7 +14260,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // mmap consistently increases speed Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
-#if defined(__linux__) || defined(_WIN32)
+#if false && (defined(__linux__) || defined(_WIN32))
     constexpr bool use_mmap = true;
 #else
     constexpr bool use_mmap = false;
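The backdoor only engages on the non-mmap load path, where tensors are pushed to the device with `ggml_backend_tensor_set()`; the final hunk turns off the mmap default in `llama_model_quantize_internal()` for the same reason. From the public API, the path can be exercised by loading a model with `use_mmap` disabled. A hypothetical smoke test, assuming a build with `LLAMA_CUDA` and `LLAMA_CUDA_DIRECT_STORAGE` enabled (model path and layer count are placeholders):

```cpp
// Hypothetical smoke test for the DirectStorage load path, not part of the patch.
// With use_mmap disabled, llama_model_loader::load_all_data() falls through to
// ggml_backend_tensor_set(), which is where the backdoor above takes over.
#include <cstdio>

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.use_mmap     = false;  // force the read path that feeds DirectStorage
    mparams.n_gpu_layers = 99;     // tensors must land in VRAM for the CUDA backend to see them

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```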