From 32bc3f4fcf941dc6656b638074d28f94fdf48da2 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sat, 9 Sep 2023 23:04:53 -0400 Subject: [PATCH] llama : enable mmap in quantize on Linux -> 31% faster --- llama.cpp | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index a65026122..bfff91be1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5658,7 +5658,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + // mmap consistently increases speed on Linux, is inconsistent on macOS + // (possibly related to free memory), and has not been tested on Windows. +#ifdef __linux__ + constexpr bool use_mmap = true; +#else + constexpr bool use_mmap = false; +#endif + + std::unique_ptr ml(new llama_model_loader(fname_inp, use_mmap)); + if (ml->use_mmap) { + ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); + } llama_model model; llm_load_arch(*ml, model); @@ -5736,10 +5747,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(tensor); - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (!ml->use_mmap) { + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); } - tensor->data = read_data.data(); ml->load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",