From 901b86b2966f74d76732d40b7876b84087f49fe7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 5 Jun 2024 16:53:19 +0300
Subject: [PATCH] imatrix : add --save-frequency cli arg

---
 common/common.cpp            | 11 +++++++-
 common/common.h              |  5 ++--
 examples/imatrix/README.md   |  5 ++--
 examples/imatrix/imatrix.cpp | 49 +++++++++++++++++-------------------
 4 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 14f4fb50c..1c5c05407 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1576,6 +1576,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_out_freq = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--save-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--process-output") {
         params.process_output = true;
         return true;
@@ -1863,7 +1871,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
     options.push_back({ "imatrix" });
     options.push_back({ "imatrix",     "-o,    --output FNAME",         "output file (default: '%s')", params.out_file.c_str() });
-    options.push_back({ "imatrix",     "       --output-frequency N",   "output every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix",     "       --output-frequency N",   "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix",     "       --save-frequency N",     "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
     options.push_back({ "imatrix",     "       --process-output",       "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
     options.push_back({ "imatrix",     "       --no-ppl",               "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
     options.push_back({ "imatrix",     "       --chunk N",              "start processing the input from chunk N (default: %d)", params.i_chunk });
diff --git a/common/common.h b/common/common.h
index a6a9717d7..de6238e27 100644
--- a/common/common.h
+++ b/common/common.h
@@ -224,8 +224,9 @@ struct gpt_params {
     // imatrix params
     std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
 
-    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
-    int32_t i_chunk    = 0;  // start processing from this chunk
+    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk     =  0; // start processing from this chunk
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 8670690d5..866ca9f56 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -7,8 +7,8 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 
 ```
 ./imatrix \
-    -m model.gguf -f some-text.txt [-o imatrix.dat] [--verbosity 1] \
-    [--process-output] [--no-ppl] [--chunk 123] [--output-frequency 10] \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
     [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```
 
@@ -17,6 +17,7 @@ The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
 * `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 
 For faster computation, make sure to use GPU offloading via the `-ngl` argument
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index b99ebb7e1..38420041c 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -22,8 +22,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt -o imatrix.dat --verbosity 1 \\\n"
-            "       [--process-output] [--no-ppl] [--chunk 123] [--output-frequency 10] \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
             "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
     LOG_TEE("\n");
 }
@@ -39,7 +39,7 @@ public:
     IMatrixCollector() = default;
     void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
+    void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
@@ -48,9 +48,6 @@ private:
     int                                    m_last_call = 0;
     std::vector<float>                     m_src1_data;
     std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
-                                                  //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };
 
 // remove any prefix and suffixes from the name
@@ -162,8 +159,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 if (m_last_call % m_params.n_out_freq == 0) {
                     save_imatrix();
                 }
-                if (m_params.n_keep > 0 && m_last_call%m_params.n_keep == 0) {
-                    keep_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                    save_imatrix(m_last_call);
                 }
             }
         }
@@ -193,8 +190,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             if (m_last_call % m_params.n_out_freq == 0) {
                 save_imatrix();
             }
-            if (m_params.n_keep > 0 && m_last_call%m_params.n_keep == 0) {
-                keep_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                save_imatrix(m_last_call);
             }
         }
     }
@@ -202,19 +199,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }
 
-void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.out_file.empty() ? "imatrix.dat" : m_params.out_file.c_str(), m_params.prompt_file.c_str());
-}
+void IMatrixCollector::save_imatrix(int ncall) const {
+    auto fname = m_params.out_file;
+    if (fname.empty()) {
+        fname = "imatrix.dat";
+    }
 
-void IMatrixCollector::keep_imatrix(int ncall) const {
-    auto file_name = m_params.out_file;
-    if (file_name.empty()) file_name = "imatrix.dat";
-    file_name += ".at_";
-    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str(), m_params.prompt_file.c_str());
-}
+    if (ncall > 0) {
+        fname += ".at_";
+        fname += std::to_string(ncall);
+    }
 
-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
     std::ofstream out(fname, std::ios::binary);
     int n_entries = m_stats.size();
     out.write((const char *) &n_entries, sizeof(n_entries));
@@ -237,13 +232,15 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
     // Write the number of call the matrix was computed with
     out.write((const char *) &m_last_call, sizeof(m_last_call));
 
-    // Write the dataset name at the end of the file to later on specify it in quantize
-    int n_dataset = strlen(dataset);
-    out.write((const char *) &n_dataset, sizeof(n_dataset));
-    out.write(dataset, n_dataset);
+    // Write the input filename at the end of the file to later on specify it in quantize
+    {
+        int len = m_params.prompt_file.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(m_params.prompt_file.c_str(), len);
+    }
 
     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
     }
 }