diff --git a/llama-util.h b/llama-util.h
index 88ec28dca..e775f64cc 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
diff --git a/llama.cpp b/llama.cpp
index 8312ffedf..6909db1ac 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -679,12 +679,16 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, false));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -2317,7 +2321,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }
 
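
Note (not part of the patch): the change turns prefetching from an on/off flag into a byte cap on madvise(MADV_WILLNEED), and load_all_data passes the total size of CPU-backend tensors as that cap, so tensors placed on other backends are not prefetched into the page cache. Below is a minimal standalone sketch of the same pattern, assuming POSIX mmap/madvise; the file path and prefetch value are made up for illustration and this is not the llama.cpp code itself.

// Sketch: map a file read-only and ask the kernel to prefetch only the
// first `prefetch` bytes of the mapping (hypothetical path and cap).
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main() {
    const char * path = "model.bin";          // hypothetical file
    size_t prefetch   = 256u * 1024 * 1024;   // hypothetical cap: 256 MiB (0 disables prefetch)

    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }
    size_t size = (size_t) st.st_size;

    void * addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    if (prefetch > 0) {
        // Advise the kernel to preload at most `prefetch` bytes of the mapping.
        if (madvise(addr, std::min(size, prefetch), MADV_WILLNEED)) {
            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno));
        }
    }

    // ... read from the mapping here ...

    munmap(addr, size);
    close(fd);
    return 0;
}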