Add backdoor to ggml to use DirectStorage to load tensors.
parent 637e9a86c2
commit f1571c96fc

4 changed files with 148 additions and 15 deletions
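The patch wires DirectStorage in through a side channel in the existing ggml_backend_tensor_set() call rather than a new API: the loader sets bit 31 of the size argument to mark a DirectStorage request and passes a pointer to a small {filename, file offset} descriptor in place of the tensor payload; a tagged call with a null data pointer means "flush outstanding requests". A minimal sketch of that calling convention, independent of the real backend (TensorLoadRequest, kDirectStorageTag and model.gguf are illustrative stand-ins, not names from the patch):

    // Sketch of the bit-31 calling convention (hypothetical stand-ins; the real
    // entry points are ggml_backend_tensor_set / ggml_backend_cuda_buffer_set_tensor).
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    constexpr uint32_t kDirectStorageTag = 1u << 31;  // high bit of `size` marks a DirectStorage request

    struct TensorLoadRequest {      // mirrors the anonymous `Temp` struct used in the patch
        const char * filename;      // model file to read from
        size_t       weights_off;   // byte offset of the tensor data within that file
    };

    // Stand-in for the backend's set_tensor hook.
    static void set_tensor(void * dst, const void * data, size_t offset, size_t size) {
        if (size & kDirectStorageTag) {
            size &= ~size_t(kDirectStorageTag);  // strip the tag to recover the real byte count
            if (data == nullptr) {
                std::printf("flush pending DirectStorage requests\n");
                return;
            }
            const auto * req = static_cast<const TensorLoadRequest *>(data);
            std::printf("queue DirectStorage read: %s @ %zu, %zu bytes -> %p\n",
                        req->filename, req->weights_off, size, (void *)((char *)dst + offset));
        } else {
            std::memcpy((char *)dst + offset, data, size);  // normal path: `data` really is the payload
        }
    }

    int main() {
        char buf[16] = {};
        TensorLoadRequest req{"model.gguf", 4096};            // hypothetical file and offset
        set_tensor(buf, &req, 0, 16 | kDirectStorageTag);     // tagged: `data` is a descriptor, not bytes
        set_tensor(buf, nullptr, 0, kDirectStorageTag);       // tagged + null data: flush
        set_tensor(buf, "0123456789abcdef", 0, sizeof(buf));  // untagged: ordinary copy
    }

The real consumer of the tag is ggml_backend_cuda_buffer_set_tensor in the ggml-cuda.cu hunk further down.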
CMakeLists.txt

@@ -103,6 +103,8 @@ option(LLAMA_BLAS "llama: use BLAS"
 option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
+option(LLAMA_CUDA_DIRECT_STORAGE "llama: use DirectStorage to upload tensors" OFF)
+set(LLAMA_DIRECT_STORAGE_DIR "" CACHE PATH "llama: path to DirectStorage directory fetched with nuget. See https://devblogs.microsoft.com/directx/directstorage-api-downloads/")
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
@@ -152,7 +154,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 # Compile flags
 #
 
-if (LLAMA_SYCL)
+if (LLAMA_SYCL OR LLAMA_CUDA_DIRECT_STORAGE)
     set(CMAKE_CXX_STANDARD 17)
 else()
     set(CMAKE_CXX_STANDARD 11)
@@ -412,6 +414,15 @@ if (LLAMA_CUDA)
     file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
     list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
 
+    if (LLAMA_CUDA_DIRECT_STORAGE)
+        file(GLOB GGML_SOURCES_CUDA_C "ggml-cuda/*.cpp")
+        file(GLOB GGML_SOURCES_CUDA_H "ggml-cuda/*.h")
+        list(APPEND GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA_C})
+        list(APPEND GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA_H})
+
+        add_compile_definitions(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    endif()
+
     add_compile_definitions(GGML_USE_CUDA)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
@@ -1172,15 +1183,15 @@ add_library(ggml OBJECT
             ggml-backend.h
             ggml-quants.c
             ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
             ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
             ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )
@@ -1198,6 +1209,14 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS ggml_shared LIBRARY)
 endif()
 
+if (LLAMA_CUDA_DIRECT_STORAGE)
+    set_property(TARGET ggml PROPERTY VS_PACKAGE_REFERENCES "Microsoft.Direct3D.DirectStorage_1.2.2")
+
+    target_include_directories(ggml PRIVATE "${LLAMA_DIRECT_STORAGE_DIR}/native/include")
+    target_link_directories(ggml PRIVATE "${LLAMA_DIRECT_STORAGE_DIR}/native/lib/x64")
+    target_link_libraries(ggml PUBLIC "${LLAMA_DIRECT_STORAGE_DIR}/native/lib/x64/dstorage.lib" cuda cudart d3d12)
+endif()
+
 # llama
 
 add_library(llama
ggml-backend.c

@@ -223,7 +223,8 @@ GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void *
 
     GGML_ASSERT(buf != NULL && "tensor buffer not set");
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    //GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(offset + (size & ~(1u << 31)) <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
     if (!size) {
         return;
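With bit 31 set, offset + size overshoots ggml_nbytes(tensor) for any normally sized tensor, which is why the original bounds assert is commented out and replaced by one that masks the tag first. A small standalone comparison of the two checks (a sketch only; note that under this scheme a genuine transfer whose byte count itself has bit 31 set, for example exactly 2 GiB, is indistinguishable from a tagged request):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t nbytes = 1 << 20;             // a 1 MiB tensor
        const size_t offset = 0;
        const size_t size   = 4096 | (1u << 31);   // 4 KiB payload, tagged as a DirectStorage request

        const bool original_check = offset + size <= nbytes;                        // fails: the tag inflates size
        const bool relaxed_check  = offset + (size & ~size_t(1u << 31)) <= nbytes;  // passes: tag masked out first

        std::printf("original: %d  relaxed: %d\n", original_check, relaxed_check);
    }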
ggml-cuda.cu (70 lines changed)

@@ -29,6 +29,7 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
+#include "ggml-cuda/dsc.h"
 
 #include <algorithm>
 #include <array>
@@ -45,6 +46,8 @@
 #include <stdio.h>
 #include <string>
 #include <vector>
+#include <filesystem>
+#include <iostream> // debug
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
@@ -79,6 +82,10 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+std::unique_ptr<DirectStorageCUDA> dsc;
+#endif
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -149,6 +156,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
     // configure logging to stdout
     // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    dsc = std::move(DirectStorageCUDA::create(8 * 1024 * 1024, 64));
+#endif
+
     return info;
 }
 
@@ -418,12 +429,67 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
     }
 }
 
+struct FileInfo {
+    std::vector<DirectStorageCUDA::File> handles;
+    size_t handle_idx = 0;
+
+    DirectStorageCUDA::File& getFile() {
+        auto& temp = handles[handle_idx];
+        ++handle_idx;
+        handle_idx %= handles.size();
+        return temp;
+    }
+};
+
+std::map<std::string, FileInfo> files;
+
 GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+    if (size & (1u << 31)) {
+        size &= ~(1u << 31);
+        if (data == nullptr) {
+            dsc->flush();
+            return;
+        }
+        struct Temp {
+            const char* filename;
+            size_t weights_off;
+        };
+        Temp* t = (Temp*)data;
+
+        std::string filename = t->filename;
+        auto it = files.find(filename);
+        if (it == files.end()) {
+            files[filename].handles.push_back(dsc->openFile(filename));
+
+#if 0
+            // This is a hack to evaluate how fast data can be read from a 2nd disk.
+            std::filesystem::path p(filename);
+            std::filesystem::path p2("d:");
+            p2 /= "\\lmcache";
+            p2 /= p.filename().c_str();
+            std::cout << p2.string() << std::endl;
+            if (std::filesystem::exists(p2)) {
+                std::cout << "opening " << p2.string() << std::endl;
+                files[filename].handles.push_back(dsc->openFile(p2.string().c_str()));
+            }
+            std::cout << "2nd file" << std::endl;
+#endif
+
+            it = files.find(filename);
+        }
+
+        dsc->loadFile(it->second.getFile(), t->weights_off, size, (char*)tensor->data + offset);
+    }
+    else
+#endif
+    {
+        CUDA_CHECK(cudaMemcpyAsync((char*)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
+    }
 }
 
 GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
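The FileInfo / files bookkeeping above is global per-filename state: each model file gets one or more DirectStorage handles, and getFile() hands them out round-robin so the (currently #if 0-disabled) second-disk experiment can interleave reads across two copies of the file; with a single handle the rotation is a no-op. The map is not guarded by any lock, so it assumes tensors are uploaded from a single thread. An isolated sketch of the rotation, with a plain struct standing in for DirectStorageCUDA::File (declared in ggml-cuda/dsc.h, which is included above but not part of this diff); the paths are hypothetical:

    // Isolated sketch of the round-robin handle selection used by FileInfo above.
    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct Handle { std::string path; };   // stand-in for DirectStorageCUDA::File

    struct FileInfo {
        std::vector<Handle> handles;
        size_t handle_idx = 0;

        Handle & getFile() {
            Handle & h = handles[handle_idx];                 // pick the current handle
            handle_idx = (handle_idx + 1) % handles.size();   // rotate for the next request
            return h;
        }
    };

    int main() {
        FileInfo fi;
        fi.handles.push_back({"c:/models/model.gguf"});    // primary copy (hypothetical)
        fi.handles.push_back({"d:/lmcache/model.gguf"});   // mirror on a second disk (hypothetical)
        for (int i = 0; i < 4; ++i) {
            std::printf("request %d -> %s\n", i, fi.getFile().path.c_str());
        }
    }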
llama.cpp (51 lines changed)

@@ -7,6 +7,9 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#include <chrono>
+#include <iostream>
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -1176,8 +1179,10 @@ struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
+    std::string filename;
 
     llama_file(const char * fname, const char * mode) {
+        filename = fname;
         fp = ggml_fopen(fname, mode);
         if (fp == NULL) {
             throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -3459,7 +3464,9 @@ struct llama_model_loader {
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
 
-    // Returns false if cancelled by progress_callback
+
+
+    // Returns false if canceled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
             llama_buf_map & bufs_mmap,
@@ -3468,6 +3475,14 @@ struct llama_model_loader {
             void * progress_callback_user_data) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        struct ggml_tensor* last_tensor = nullptr;
+
+        // debug statistics
+        size_t total_data_read = 0;
+        auto start = std::chrono::high_resolution_clock::now();
+#endif
+
         std::vector<no_init<uint8_t>> read_buf;
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
@@ -3511,16 +3526,39 @@ struct llama_model_loader {
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(cur->data, ggml_nbytes(cur));
                 } else {
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+                    // backdoor to load tensors with DirectStorage
+                    last_tensor = cur;
+                    struct Temp {
+                        const char* filename;
+                        size_t weights_off;
+                    };
+
+                    Temp t;
+                    t.filename = file->filename.c_str();
+                    t.weights_off = weight->offs;
+
+                    ggml_backend_tensor_set(cur, &t, 0, n_size | (1u << 31));
+
+#else
                     read_buf.resize(ggml_nbytes(cur));
                     file->seek(weight->offs, SEEK_SET);
                     file->read_raw(read_buf.data(), ggml_nbytes(cur));
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+#endif
+
                 }
             }
 
             size_done += n_size;
         }
 
+#if defined(GGML_ENABLE_DIRECT_STORAGE_CUDA)
+        // trigger flush of unread data
+        if (last_tensor)
+            ggml_backend_tensor_set(last_tensor, 0, 0, 1u << 31);
+#endif
+
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
@@ -3541,6 +3579,14 @@ struct llama_model_loader {
             }
         }
 
+#if defined(ENABLE_DIRECT_STORAGE_CUDA)
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double, std::ratio<1,1>> delta(end - start);
+        //auto seconds = std::chrono::duration_cast<double, std::chrono::seconds>(delta);
+        std::cout << "load time: " << delta.count() << std::endl;;
+#endif
+
+
         return true;
     }
 };
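One detail in the hunk above: the timing block is guarded by ENABLE_DIRECT_STORAGE_CUDA, while every other guard in the patch uses GGML_ENABLE_DIRECT_STORAGE_CUDA, so as committed the block compiles out and the start timestamp taken at the top of load_all_data() is never read. A self-contained sketch of the timing it appears to intend, assuming the missing GGML_ prefix is a typo:

    // Self-contained version of the timing the block appears to intend.
    #include <chrono>
    #include <iostream>
    #include <thread>

    int main() {
        auto start = std::chrono::high_resolution_clock::now();
        std::this_thread::sleep_for(std::chrono::milliseconds(50));   // stands in for the tensor loading loop
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> delta = end - start;            // std::ratio<1,1>, i.e. seconds
        std::cout << "load time: " << delta.count() << " s" << std::endl;
    }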
@@ -5874,6 +5920,7 @@ static bool llm_load_tensors(
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+    std::cout << "model load time: " << model.t_load_us / 1000.0f << "ms" << std::endl;
     return true;
 }
 
@@ -14213,7 +14260,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // mmap consistently increases speed Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
-#if defined(__linux__) || defined(_WIN32)
+#if false && defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
 #else
     constexpr bool use_mmap = false;