fixup! GPU weights not in RAM, direct loading with cuFile

JohannesGaessler 2023-05-18 23:57:13 +02:00
parent fa1a29f36f
commit 1bfe5a9886
3 changed files with 40 additions and 36 deletions
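This fixup reworks the cuFile (GPUDirect Storage) loading path: instead of reporting failure back to llama.cpp, ggml_cuda_load_data now falls back to a buffered read on its own. For reference, the direct-read call sequence used in the diff below, condensed into a standalone helper; the helper name load_tensor_direct and the minimal error handling are illustrative, not part of the commit.

// Illustrative sketch of a cuFile direct read into device memory (assumes a
// Linux system with GPUDirect Storage installed; link against libcufile).
#include <fcntl.h>
#include <unistd.h>
#include <cstring>
#include <cuda_runtime.h>
#include <cufile.h>

static void * load_tensor_direct(const char * fname, size_t size, off_t offset) {
    CUfileDescr_t cf_descr;
    memset(&cf_descr, 0, sizeof(CUfileDescr_t));
    const int fd = open(fname, O_RDONLY | O_DIRECT, 0644); // O_DIRECT, as in the loader below
    if (fd < 0) {
        return nullptr;
    }
    cf_descr.handle.fd = fd;
    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;

    CUfileHandle_t cf_handle;
    CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
    if (status.err != CU_FILE_SUCCESS) {
        close(fd);
        return nullptr; // the caller would fall back to fread + cudaMemcpy
    }

    void * dev_buf = nullptr;
    cudaMalloc(&dev_buf, size);
    // Read `size` bytes at file offset `offset` straight into device memory,
    // bypassing any host staging buffer.
    cuFileRead(cf_handle, dev_buf, size, offset, 0);

    cuFileHandleDeregister(cf_handle);
    close(fd);
    return dev_buf;
}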

ggml-cuda.cu

@@ -383,7 +383,7 @@ struct cuda_buffer {
 static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -402,7 +402,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -908,7 +908,7 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
     tensor->backend = GGML_BACKEND_CUDA;
 }
 
-bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, const int num_tensors, const size_t * offsets) {
     CUfileDescr_t cf_descr;
     memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
     const int fd_cf = open(fname, O_RDONLY|O_DIRECT, 0644);
@@ -921,20 +921,48 @@ bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensor
     if (status.err == CU_FILE_INTERNAL_ERROR) {
         fprintf(stderr, "WARNING: cuFile experienced an internal error while loading weights from \"%s\". Using a workaround (slower). "
                 "This happens with weight files on Btrfs partitions. ext4 and NTFS are confirmed to work.\n", fname);
+    } else {
+        for (int i = 0; i < num_tensors; ++i) {
+            ggml_tensor * tensor = tensors[i];
+            const size_t size = ggml_nbytes(tensor);
+            const size_t offset = offsets[i];
+
+            size_t actual_size;
+            void * buf = ggml_cuda_pool_malloc(size, &actual_size);
+            cuFileRead(cf_handle, buf, size, offset, 0);
+            tensor->data = buf;
+        }
+        return;
     }
-    if (status.err != CU_FILE_SUCCESS) {
-        return false;
-    }
+
+    FILE * fp = fopen(fname, "rb");
 
     for (int i = 0; i < num_tensors; ++i) {
         ggml_tensor * tensor = tensors[i];
         const size_t size = ggml_nbytes(tensor);
         const size_t offset = offsets[i];
 
-        size_t actual_size;
-        void * buf = ggml_cuda_pool_malloc(size, &actual_size);
-        cuFileRead(cf_handle, buf, size, offset, 0);
+        void * buf;
+        CUDA_CHECK(cudaMalloc(&buf, size));
+        void * buf_host = malloc(size);
+
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+        int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+        GGML_ASSERT(ret == 0); // same
+
+        size_t ret2 = fread(buf_host, size, 1, fp);
+        if (ret2 != 1) {
+            fprintf(stderr, "unexpectedly reached end of file");
+            exit(1);
+        }
+
+        cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+        cudaDeviceSynchronize();
+
         tensor->data = buf;
+        free(buf_host);
     }
-    return true;
 }
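With this hunk, the CU_FILE_INTERNAL_ERROR case (the Btrfs situation named in the warning) is handled inside ggml_cuda_load_data itself: the weights are read with fopen/fread into an ordinary host buffer and then copied to the device. A condensed sketch of that fallback as a standalone helper; the name load_tensor_via_host and the FILE-based interface are illustrative, not part of the commit.

// Illustrative sketch of the fallback path: stage one tensor through a
// pageable host buffer when direct cuFile reads are unavailable.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void * load_tensor_via_host(FILE * fp, size_t size, long offset) {
    void * dev_buf = nullptr;
    cudaMalloc(&dev_buf, size);              // device destination

    void * host_buf = malloc(size);          // pageable staging buffer
    if (fseek(fp, offset, SEEK_SET) != 0 ||  // _fseeki64 is needed on Windows for offsets > 2 GiB
        fread(host_buf, size, 1, fp) != 1) {
        fprintf(stderr, "unexpectedly reached end of file\n");
        exit(1);
    }

    cudaMemcpy(dev_buf, host_buf, size, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();                 // ensure the copy is done before the staging buffer is freed
    free(host_buf);
    return dev_buf;
}

Unlike the llama.cpp-side fallback removed in the last file of this commit, which staged through pinned host memory (ggml_cuda_host_malloc) and a pooled device buffer, this in-library fallback uses plain malloc and a direct cudaMalloc.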

ggml-cuda.h

@@ -14,11 +14,9 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 // TODO: export these with GGML_API
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void ggml_cuda_pool_free(void * ptr, size_t size);
 
 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
-bool ggml_cuda_load_data_cufile(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor ** tensors, int num_tensors, const size_t * offsets);
 
 #ifdef __cplusplus
 }

llama.cpp

@@ -10,7 +10,6 @@
 
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
-#include <cuda_runtime.h>
 #include "ggml-cuda.h"
 #endif
 
@@ -1069,28 +1068,7 @@ static void llama_model_load_internal(
             LLAMA_ASSERT(lt.shards.size() == 1);
             offsets.emplace_back(lt.shards.at(0).file_off);
         }
 
-        bool cufile_success = ggml_cuda_load_data_cufile(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
-        if (!cufile_success) {
-            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-                if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                    continue;
-                }
-
-                size_t actual_size;
-                void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
-                void * buf_host = ggml_cuda_host_malloc(lt.size);
-
-                llama_file & file = ml->file_loaders.at(lt.shards.at(0).file_idx)->file;
-                file.seek(lt.shards.at(0).file_off, SEEK_SET);
-                file.read_raw(buf_host, lt.size);
-
-                cudaMemcpy(buf, buf_host, lt.size, cudaMemcpyHostToDevice);
-                cudaDeviceSynchronize();
-
-                lt.ggml_tensor->data = buf;
-                ggml_cuda_host_free(buf_host);
-            }
-        }
+        ggml_cuda_load_data(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
     }
 #endif // GGML_USE_CUBLAS
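With the fallback gone from llama.cpp, the call site in llama_model_load_internal reduces to collecting the GPU tensors and their file offsets and making a single call. A hedged reconstruction of that call site follows; only the lines visible in this hunk are verbatim, the surrounding vector setup is inferred from the removed loop above and depends on llama.cpp internals.

// Sketch of the simplified call site (reconstructed context, not copied from the file).
std::vector<ggml_tensor *> tensors;
std::vector<size_t>        offsets;
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
    if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
        continue;  // only tensors placed on the GPU are loaded here
    }
    tensors.emplace_back(lt.ggml_tensor);
    LLAMA_ASSERT(lt.shards.size() == 1);
    offsets.emplace_back(lt.shards.at(0).file_off);
}
ggml_cuda_load_data(fname.c_str(), tensors.data(), tensors.size(), offsets.data());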