imatrix : add --save-frequency cli arg
parent cbe51d7f3d
commit 901b86b296
4 changed files with 39 additions and 31 deletions
@@ -1576,6 +1576,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_out_freq = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--save-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--process-output") {
         params.process_output = true;
         return true;
@@ -1863,7 +1871,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
     options.push_back({ "imatrix" });
     options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
-    options.push_back({ "imatrix", "       --output-frequency N", "output every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix", "       --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix", "       --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
     options.push_back({ "imatrix", "       --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
     options.push_back({ "imatrix", "       --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
     options.push_back({ "imatrix", "       --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
@@ -225,6 +225,7 @@ struct gpt_params {
     std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
 
     int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk     =  0; // start processing from this chunk
 
     bool process_output = false; // collect data for the output tensor
@@ -7,8 +7,8 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 
 ```
 ./imatrix \
-    -m model.gguf -f some-text.txt [-o imatrix.dat] [--verbosity 1] \
-    [--process-output] [--no-ppl] [--chunk 123] [--output-frequency 10] \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
     [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```
 
@@ -17,6 +17,7 @@ The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
 * `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 
 For faster computation, make sure to use GPU offloading via the `-ngl` argument
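As an illustration (the numbers here are assumed, not part of the change): running with `--output-frequency 10 --save-frequency 50` over 100 chunks keeps overwriting the working file `imatrix.dat` every 10 chunks, and additionally leaves standalone snapshots `imatrix.dat.at_50` and `imatrix.dat.at_100` on disk, following the `.at_<ncall>` suffix applied by `save_imatrix()` further down in this commit.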
@@ -22,8 +22,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt -o imatrix.dat --verbosity 1 \\\n"
-            "       [--process-output] [--no-ppl] [--chunk 123] [--output-frequency 10] \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
     LOG_TEE("\n");
 }
@@ -39,7 +39,7 @@ public:
     IMatrixCollector() = default;
     void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
+    void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
@@ -48,9 +48,6 @@ private:
     int m_last_call = 0;
     std::vector<float> m_src1_data;
     std::vector<char>  m_ids; // the expert ids from ggml_mul_mat_id
-    //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };
 
 // remove any prefix and suffixes from the name
@@ -162,8 +159,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             if (m_last_call % m_params.n_out_freq == 0) {
                 save_imatrix();
             }
-            if (m_params.n_keep > 0 && m_last_call%m_params.n_keep == 0) {
-                keep_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                save_imatrix(m_last_call);
             }
         }
     }
@@ -193,8 +190,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             if (m_last_call % m_params.n_out_freq == 0) {
                 save_imatrix();
             }
-            if (m_params.n_keep > 0 && m_last_call%m_params.n_keep == 0) {
-                keep_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                save_imatrix(m_last_call);
             }
         }
     }
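To make the relationship between the two frequencies concrete, here is a minimal standalone sketch (not part of the commit) of the scheduling shown above: `target_file` is a hypothetical helper that mirrors the filename construction from `save_imatrix()` in the next hunk, and the values 10 and 50 are assumed for illustration.

```
#include <cstdio>
#include <string>

// Hypothetical helper mirroring the naming used by save_imatrix(): the working
// file is overwritten in place, while snapshots get an ".at_<ncall>" suffix.
static std::string target_file(const std::string & out_file, int ncall) {
    std::string fname = out_file.empty() ? "imatrix.dat" : out_file;
    if (ncall > 0) {
        fname += ".at_";
        fname += std::to_string(ncall);
    }
    return fname;
}

int main() {
    const int n_out_freq  = 10; // --output-frequency: refresh the working file
    const int n_save_freq = 50; // --save-frequency:   keep a separate snapshot

    for (int last_call = 1; last_call <= 100; ++last_call) {
        if (last_call % n_out_freq == 0) {
            printf("chunk %3d -> overwrite %s\n", last_call, target_file("imatrix.dat", -1).c_str());
        }
        if (n_save_freq > 0 && last_call % n_save_freq == 0) {
            printf("chunk %3d -> snapshot  %s\n", last_call, target_file("imatrix.dat", last_call).c_str());
        }
    }
    return 0;
}
```

With those assumed values, `imatrix.dat` is rewritten ten times, while `imatrix.dat.at_50` and `imatrix.dat.at_100` remain on disk as separate snapshots.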
@@ -202,19 +199,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }
 
-void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.out_file.empty() ? "imatrix.dat" : m_params.out_file.c_str(), m_params.prompt_file.c_str());
-}
-
-void IMatrixCollector::keep_imatrix(int ncall) const {
-    auto file_name = m_params.out_file;
-    if (file_name.empty()) file_name = "imatrix.dat";
-    file_name += ".at_";
-    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str(), m_params.prompt_file.c_str());
-}
-
-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
+void IMatrixCollector::save_imatrix(int ncall) const {
+    auto fname = m_params.out_file;
+    if (fname.empty()) {
+        fname = "imatrix.dat";
+    }
+
+    if (ncall > 0) {
+        fname += ".at_";
+        fname += std::to_string(ncall);
+    }
+
     std::ofstream out(fname, std::ios::binary);
     int n_entries = m_stats.size();
     out.write((const char *) &n_entries, sizeof(n_entries));
@@ -237,13 +232,15 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
     // Write the number of call the matrix was computed with
     out.write((const char *) &m_last_call, sizeof(m_last_call));
 
-    // Write the dataset name at the end of the file to later on specify it in quantize
-    int n_dataset = strlen(dataset);
-    out.write((const char *) &n_dataset, sizeof(n_dataset));
-    out.write(dataset, n_dataset);
+    // Write the input filename at the end of the file to later on specify it in quantize
+    {
+        int len = m_params.prompt_file.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(m_params.prompt_file.c_str(), len);
+    }
 
     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
     }
 }
 
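The dataset name is stored as a length-prefixed string: a plain `int` holding the byte count, followed by the raw characters with no terminator, so that `quantize` can recover it later (per the comment in the hunk above). Below is a self-contained round-trip sketch of just that convention; the demo file name and string are made up, and nothing else about the imatrix file layout is assumed.

```
#include <cstdio>
#include <fstream>
#include <string>

int main() {
    const std::string dataset = "some-text.txt"; // stand-in for m_params.prompt_file

    // write: a plain int length followed by the raw bytes (no null terminator),
    // the same convention used for the trailing dataset-name field above
    {
        std::ofstream out("field-demo.bin", std::ios::binary);
        int len = (int) dataset.size();
        out.write((const char *) &len, sizeof(len));
        out.write(dataset.c_str(), len);
    }

    // read it back
    std::ifstream in("field-demo.bin", std::ios::binary);
    int len = 0;
    in.read((char *) &len, sizeof(len));
    std::string name(len, '\0');
    in.read(&name[0], len);
    printf("dataset name: %s\n", name.c_str());
    return 0;
}
```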