llama : enable mmap in quantize on Linux -> 31% faster
commit 32bc3f4fcf
parent e6616cf0db
1 changed file with 17 additions and 4 deletions
llama.cpp
@@ -5658,7 +5658,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    // mmap consistently increases speed on Linux, is inconsistent on macOS
+    // (possibly related to free memory), and has not been tested on Windows.
+#ifdef __linux__
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, use_mmap));
+    if (ml->use_mmap) {
+        ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(*ml, model);
@@ -5736,10 +5747,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
-        }
-        tensor->data = read_data.data();
+        if (!ml->use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
+        }
         ml->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
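For readers unfamiliar with the pattern, here is a minimal self-contained sketch (C++, Linux/POSIX only) of what the two hunks combine to do: map the input file when mmap is enabled, and otherwise fall back to reading data into a reusable buffer. The program, the map_or_read layout, and the file name model.gguf are hypothetical illustrations, not llama.cpp code.

// Sketch of the mmap-vs-read pattern used above. Hypothetical standalone
// program, not llama.cpp code; assumes Linux/POSIX (open, fstat, mmap, pread).
#include <cstdint>
#include <cstdio>
#include <vector>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#ifdef __linux__
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;
#endif

int main() {
    const char * fname = "model.gguf"; // hypothetical input file
    int fd = open(fname, O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }
    size_t file_size = (size_t) st.st_size;
    if (file_size == 0) { fprintf(stderr, "empty file\n"); close(fd); return 1; }

    const uint8_t * base = nullptr;
    void * map = MAP_FAILED;
    std::vector<uint8_t> read_data; // reusable buffer for the non-mmap path

    if (use_mmap) {
        // No read-ahead hint here, mirroring /* prefetch */ 0 in the commit:
        // pages are faulted in lazily as the data is actually touched.
        map = mmap(nullptr, file_size, PROT_READ, MAP_SHARED, fd, 0);
        if (map != MAP_FAILED) {
            base = (const uint8_t *) map;
        }
    }

    // Example access: "load" a region [offs, offs + size) of the file,
    // standing in for one tensor's data in the quantization loop.
    size_t offs = 0;
    size_t size = file_size < 4096 ? file_size : 4096;
    const uint8_t * data;
    if (base) {
        data = base + offs; // no copy: read straight from the mapping
    } else {
        if (read_data.size() < size) {
            read_data.resize(size);
        }
        if (pread(fd, read_data.data(), size, (off_t) offs) != (ssize_t) size) {
            perror("pread"); close(fd); return 1;
        }
        data = read_data.data();
    }
    printf("first byte: 0x%02x (%s)\n", data[0], base ? "mmap" : "pread");

    if (map != MAP_FAILED) munmap(map, file_size);
    close(fd);
    return 0;
}

A note on the design, grounded in the diff itself: the loader is constructed with /* prefetch */ 0, so the kernel is not asked to read the whole file ahead of time, and the second hunk skips allocating (and copying into) read_data entirely when the tensor data can be served from the mapping. The speedup comes from eliminating that per-tensor copy, not from read-ahead.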