From 901b86b2966f74d76732d40b7876b84087f49fe7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 5 Jun 2024 16:53:19 +0300 Subject: [PATCH] imatrix : add --save-frequency cli arg --- common/common.cpp | 11 +++++++- common/common.h | 5 ++-- examples/imatrix/README.md | 5 ++-- examples/imatrix/imatrix.cpp | 49 +++++++++++++++++------------------- 4 files changed, 39 insertions(+), 31 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 14f4fb50c..1c5c05407 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1576,6 +1576,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.n_out_freq = std::stoi(argv[i]); return true; } + if (arg == "--save-frequency") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_save_freq = std::stoi(argv[i]); + return true; + } if (arg == "--process-output") { params.process_output = true; return true; @@ -1863,7 +1871,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "imatrix" }); options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); - options.push_back({ "imatrix", " --output-frequency N", "output every N iterations (default: %d)", params.n_out_freq }); + options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); + options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); diff --git a/common/common.h b/common/common.h index a6a9717d7..de6238e27 100644 --- a/common/common.h +++ b/common/common.h @@ -224,8 +224,9 @@ struct gpt_params { // imatrix params std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file - int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations - int32_t i_chunk = 0; // start processing from this chunk + int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations + int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations + int32_t i_chunk = 0; // start processing from this chunk bool process_output = false; // collect data for the output tensor bool compute_ppl = true; // whether to compute perplexity diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 8670690d5..866ca9f56 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -7,8 +7,8 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/ ``` ./imatrix \ - -m model.gguf -f some-text.txt [-o imatrix.dat] [--verbosity 1] \ - [--process-output] [--no-ppl] [--chunk 123] [--output-frequency 10] \ + -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ + [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] ``` @@ -17,6 +17,7 @@ The parameters in square brackets are optional and have the following meaning: * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used. * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. * `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) +* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never) * `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. For faster computation, make sure to use GPU offloading via the `-ngl` argument diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b99ebb7e1..38420041c 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -22,8 +22,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s \\\n" - " -m model.gguf -f some-text.txt -o imatrix.dat --verbosity 1 \\\n" - " [--process-output] [--no-ppl] [--chunk 123] [--output-frequency 10] \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" + " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); LOG_TEE("\n"); } @@ -39,7 +39,7 @@ public: IMatrixCollector() = default; void set_params(gpt_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); - void save_imatrix() const; + void save_imatrix(int ncall = -1) const; bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; @@ -48,9 +48,6 @@ private: int m_last_call = 0; std::vector m_src1_data; std::vector m_ids; // the expert ids from ggml_mul_mat_id - // - void save_imatrix(const char * file_name, const char * dataset) const; - void keep_imatrix(int ncall) const; }; // remove any prefix and suffixes from the name @@ -162,8 +159,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * if (m_last_call % m_params.n_out_freq == 0) { save_imatrix(); } - if (m_params.n_keep > 0 && m_last_call%m_params.n_keep == 0) { - keep_imatrix(m_last_call); + if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { + save_imatrix(m_last_call); } } } @@ -193,8 +190,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * if (m_last_call % m_params.n_out_freq == 0) { save_imatrix(); } - if (m_params.n_keep > 0 && m_last_call%m_params.n_keep == 0) { - keep_imatrix(m_last_call); + if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { + save_imatrix(m_last_call); } } } @@ -202,19 +199,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * return true; } -void IMatrixCollector::save_imatrix() const { - save_imatrix(m_params.out_file.empty() ? "imatrix.dat" : m_params.out_file.c_str(), m_params.prompt_file.c_str()); -} +void IMatrixCollector::save_imatrix(int ncall) const { + auto fname = m_params.out_file; + if (fname.empty()) { + fname = "imatrix.dat"; + } -void IMatrixCollector::keep_imatrix(int ncall) const { - auto file_name = m_params.out_file; - if (file_name.empty()) file_name = "imatrix.dat"; - file_name += ".at_"; - file_name += std::to_string(ncall); - save_imatrix(file_name.c_str(), m_params.prompt_file.c_str()); -} + if (ncall > 0) { + fname += ".at_"; + fname += std::to_string(ncall); + } -void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); out.write((const char *) &n_entries, sizeof(n_entries)); @@ -237,13 +232,15 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co // Write the number of call the matrix was computed with out.write((const char *) &m_last_call, sizeof(m_last_call)); - // Write the dataset name at the end of the file to later on specify it in quantize - int n_dataset = strlen(dataset); - out.write((const char *) &n_dataset, sizeof(n_dataset)); - out.write(dataset, n_dataset); + // Write the input filename at the end of the file to later on specify it in quantize + { + int len = m_params.prompt_file.size(); + out.write((const char *) &len, sizeof(len)); + out.write(m_params.prompt_file.c_str(), len); + } if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } }