From 1bfe5a98867a6057396b6ca4829cedc5c3ae0a10 Mon Sep 17 00:00:00 2001
From: JohannesGaessler
Date: Thu, 18 May 2023 23:57:13 +0200
Subject: [PATCH] fixup! GPU weights not in RAM, direct loading with cuFile

---
 ggml-cuda.cu | 48 ++++++++++++++++++++++++++++++++++++++----------
 ggml-cuda.h  |  4 +---
 llama.cpp    | 24 +-----------------------
 3 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 893c7d4a9..6e6e12aaa 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -383,7 +383,7 @@ struct cuda_buffer {
 static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -402,7 +402,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -908,7 +908,7 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
     tensor->backend = GGML_BACKEND_CUDA;
 }
 
-bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
     CUfileDescr_t cf_descr;
     memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
     const int fd_cf = open(fname, O_RDONLY|O_DIRECT, 0644);
@@ -921,20 +921,48 @@ bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensor
     if (status.err == CU_FILE_INTERNAL_ERROR) {
         fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). "
             "This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.\n", fname);
+    } else {
+        for (int i = 0; i < num_tensors; ++i) {
+            ggml_tensor * tensor = tensors[i];
+            const size_t size = ggml_nbytes(tensor);
+            const size_t offset = offsets[i];
+
+            size_t actual_size;
+            void * buf = ggml_cuda_pool_malloc(size, &actual_size);
+            cuFileRead(cf_handle, buf, size, offset, 0);
+            tensor->data = buf;
+        }
+        return;
     }
-    if (status.err != CU_FILE_SUCCESS) {
-        return false;
-    }
+
+    FILE * fp = fopen(fname, "rb");
 
     for (int i = 0; i < num_tensors; ++i) {
         ggml_tensor * tensor = tensors[i];
         const size_t size = ggml_nbytes(tensor);
         const size_t offset = offsets[i];
 
-        size_t actual_size;
-        void * buf = ggml_cuda_pool_malloc(size, &actual_size);
-        cuFileRead(cf_handle, buf, size, offset, 0);
+        void * buf;
+        CUDA_CHECK(cudaMalloc(&buf, size));
+        void * buf_host = malloc(size);
+
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+        int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+        GGML_ASSERT(ret == 0); // same
+
+        size_t ret2 = fread(buf_host, size, 1, fp);
+        if (ret2 != 1) {
+            fprintf(stderr, "unexpectedly reached end of file");
+            exit(1);
+        }
+
+        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+        cudaDeviceSynchronize();
+
         tensor->data = buf;
+        free(buf_host);
     }
-    return true;
 }
 
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 7485af9f7..ab2c690b0 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -14,11 +14,9 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 // TODO: export these with GGML_API
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void ggml_cuda_pool_free(void * ptr, size_t size);
 
 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
-bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index 644b6a57b..8312ffedf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10,7 +10,6 @@
 #include "ggml.h"
 
 #ifdef GGML_USE_CUBLAS
-#include <cuda_runtime.h>
 #include "ggml-cuda.h"
 #endif
 
@@ -1069,28 +1068,7 @@ static void llama_model_load_internal(
             LLAMA_ASSERT(lt.shards.size() == 1);
             offsets.emplace_back(lt.shards.at(0).file_off);
         }
-        bool cufile_success = ggml_cuda_load_data_cufile(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
-
-        if (!cufile_success) {
-            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-                if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                    continue;
-                }
-                size_t actual_size;
-                void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
-                void * buf_host = ggml_cuda_host_malloc(lt.size);
-
-                llama_file & file = ml->file_loaders.at(lt.shards.at(0).file_idx)->file;
-                file.seek(lt.shards.at(0).file_off, SEEK_SET);
-                file.read_raw(buf_host, lt.size);
-
-                cudaMemcpy(buf, buf_host, lt.size, cudaMemcpyHostToDevice);
-                cudaDeviceSynchronize();
-
-                lt.ggml_tensor->data = buf;
-                ggml_cuda_host_free(buf_host);
-            }
-        }
+        ggml_cuda_load_data(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
     }
 #endif // GGML_USE_CUBLAS
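
For reference, below is a minimal standalone sketch (not part of the patch) of the fallback path that ggml_cuda_load_data now takes when cuFile reports CU_FILE_INTERNAL_ERROR: seek to the tensor's byte offset, read its bytes into a host staging buffer with stdio, and upload them to the device with cudaMemcpy. The helper name load_tensor_fallback and its abort-on-error handling are illustrative assumptions, not ggml/llama.cpp API.

// Sketch of the stdio fallback introduced above: read one tensor's bytes from
// the weight file into a pageable host buffer, then copy it to a freshly
// allocated device buffer. load_tensor_fallback is a hypothetical helper.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void * load_tensor_fallback(const char * fname, size_t offset, size_t size) {
    FILE * fp = std::fopen(fname, "rb");
    if (fp == NULL) {
        std::fprintf(stderr, "failed to open %s\n", fname);
        std::exit(1);
    }

    // Seek to the tensor's byte offset within the weight file.
#ifdef _WIN32
    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
#else
    int ret = std::fseek(fp, (long) offset, SEEK_SET);
#endif
    if (ret != 0) {
        std::fprintf(stderr, "seek to offset %zu failed\n", offset);
        std::exit(1);
    }

    // Stage the tensor bytes in host memory.
    void * buf_host = std::malloc(size);
    if (std::fread(buf_host, size, 1, fp) != 1) {
        std::fprintf(stderr, "unexpectedly reached end of file\n");
        std::exit(1);
    }
    std::fclose(fp);

    // Allocate device memory and upload the staged bytes.
    void * buf = NULL;
    if (cudaMalloc(&buf, size) != cudaSuccess) {
        std::fprintf(stderr, "cudaMalloc of %zu bytes failed\n", size);
        std::exit(1);
    }
    cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    std::free(buf_host);
    return buf; // caller stores this pointer in tensor->data
}

Note that the patch's fallback stages through plain malloc, whereas the code it removes from llama.cpp used the pinned ggml_cuda_host_malloc buffer; pinned memory would speed up the host-to-device copy, but the staging buffer here is short-lived, so pageable memory keeps the fallback simple.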