fixup! GPU weights not in RAM, direct loading with cuFile

JohannesGaessler 2023-05-19 10:13:20 +02:00
parent 1bfe5a9886
commit 24d5ddf67c
2 changed files with 9 additions and 5 deletions


@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }

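To make the new semantics concrete outside the diff, here is a minimal standalone sketch of a byte-capped MADV_WILLNEED on a POSIX system. The helper map_with_prefetch is hypothetical and not part of the commit; it only illustrates the prefetch parameter's three modes (0 = none, a byte count = partial, (size_t) -1 = clamped to the whole file).

#include <algorithm>
#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <sys/mman.h>

// Map a file read-only and advise the kernel to prefetch only the first
// `prefetch` bytes. prefetch == 0 disables readahead advice entirely;
// prefetch == (size_t) -1 is clamped to the whole file via std::min.
static void * map_with_prefetch(std::FILE * fp, size_t size, size_t prefetch) {
    void * addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fileno(fp), 0);
    if (addr == MAP_FAILED) {
        std::fprintf(stderr, "mmap failed: %s\n", std::strerror(errno));
        return NULL;
    }
    if (prefetch > 0) {
        // Only the leading std::min(size, prefetch) bytes are advised; in the
        // commit this window covers the weights that stay in system RAM.
        if (madvise(addr, std::min(size, prefetch), MADV_WILLNEED)) {
            std::fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                    std::strerror(errno));
        }
    }
    return addr;
}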

@@ -679,12 +679,16 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, false));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -2317,7 +2321,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }
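After this commit the call sites exercise the parameter as follows: the default (size_t) -1 prefetches the whole file, load_all_data passes prefetch_size (only the bytes of CPU-resident tensors), and the LoRA path passes 0 (no prefetch). A hedged sketch of the summation, with simplified stand-in types (tensor_info and on_cpu are illustrative, not from the codebase):

#include <cstddef>
#include <vector>

// Simplified stand-ins for llama_load_tensor and its backend tag; the real
// structs in the diff carry more fields.
struct tensor_info {
    size_t size;
    bool   on_cpu; // true for GGML_BACKEND_CPU tensors
};

// Mirrors the load_all_data() change: only tensors that stay on the CPU count
// toward the prefetch window, so weights destined for the GPU are not paged
// into system RAM ahead of time.
size_t compute_prefetch_size(const std::vector<tensor_info> & tensors) {
    size_t prefetch_size = 0;
    for (const tensor_info & t : tensors) {
        if (t.on_cpu) {
            prefetch_size += t.size;
        }
    }
    return prefetch_size;
}

Since madvise applies the count as a prefix length from the start of the mapping, the sum lines up exactly only when CPU-resident tensors precede the offloaded ones in the file; otherwise it approximates how many bytes are worth prefetching.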