imatrix : use GGUF to store imatrix data

Francis Couture-Harpin 2024-09-06 17:17:25 -04:00
parent 347247a24e
commit 3de9300c37
4 changed files with 352 additions and 149 deletions

convert_legacy_imatrix_to_gguf.py (new file)

@@ -0,0 +1,118 @@
#!/usr/bin/env python3
from __future__ import annotations

import os
import sys
import logging
import argparse

from typing import Any
from pathlib import Path
from dataclasses import dataclass

import numpy as np
import numpy.typing as npt

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

logger = logging.getLogger("imatrix-to-gguf")


class IMatrixWriter(gguf.GGUFWriter):
    def add_architecture(self) -> None:
        # no arch is stored in imatrix files
        pass


@dataclass
class IMatrixEntry:
    values: np.ndarray[Any, np.dtype[np.float32]]
    counts: np.ndarray[Any, np.dtype[np.float32]]


class IMatrixReader:
    chunk_size: int = 512  # guess
    offset: int = 0
    data: np.ndarray[Any, np.dtype[np.uint8]]
    n_entries: int
    entries: dict[str, IMatrixEntry]
    chunk_count: int
    dataset: str

    def _get(self, dtype: npt.DTypeLike, count: int = 1) -> npt.NDArray[Any]:
        # read the next `count` items of `dtype` from the memory-mapped file
        count = int(count)
        itemsize = int(np.empty([], dtype=dtype).itemsize)
        offset = self.offset
        self.offset = offset + itemsize * count
        return self.data[offset:self.offset].view(dtype=dtype)[:count]

    def __init__(self, imatrix: Path):
        self.offset = 0
        self.entries = {}
        self.data = np.memmap(imatrix)
        n_entries = self._get(np.int32).item()
        assert n_entries >= 0
        for _ in range(n_entries):
            name_len = self._get(np.int32).item()
            name = self._get(np.uint8, name_len).tobytes().decode("utf-8")
            ncall = self._get(np.int32).item()
            nval = self._get(np.int32).item()
            data = self._get(np.float32, nval)
            assert name not in self.entries, f"duplicated name: {name!r}"

            self.entries[name] = IMatrixEntry(data, np.array([ncall * self.chunk_size], dtype=np.float32))

        self.chunk_count = self._get(np.int32).item()
        self.dataset = self._get(np.uint8, self._get(np.int32).item()).tobytes().decode("utf-8")

    def to_writer(self, outfile: Path) -> IMatrixWriter:
        writer = IMatrixWriter(path=outfile, arch="")

        writer.add_type(gguf.GGUFType.IMATRIX)
        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_COUNT, self.chunk_count, gguf.GGUFValueType.UINT32)
        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_SIZE, self.chunk_size, gguf.GGUFValueType.UINT32)
        writer.add_key_value(gguf.Keys.IMatrix.DATASET, self.dataset, gguf.GGUFValueType.STRING)

        for name, entry in self.entries.items():
            writer.add_tensor(name + ".sums", entry.values)
            writer.add_tensor(name + ".counts", entry.counts)

        return writer


def parse_args():
    parser = argparse.ArgumentParser(
        description="Convert an old imatrix.dat file to a GGUF compatible file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input.",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "imatrix", type=Path,
        help="path to an imatrix file",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    if args.outfile is None:
        input_file: Path = args.imatrix
        if input_file.suffix != ".gguf":
            args.outfile = input_file.with_suffix(".gguf")
        else:
            # avoid passing a None path to the writer when the input is already .gguf
            logger.error("input file already has a .gguf suffix, use --outfile to set the output path")
            sys.exit(1)

    writer = IMatrixReader(args.imatrix).to_writer(args.outfile)

    writer.write_header_to_file(args.outfile)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()
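
Once converted, the result can be sanity-checked by reading it back with gguf-py's GGUFReader. The following is a minimal sketch, not part of this commit, and it assumes the converter wrote to imatrix.gguf:

    # Not part of this commit: sanity-check a converted imatrix file.
    from gguf import GGUFReader

    reader = GGUFReader("imatrix.gguf")  # assumed output path

    # metadata keys written by IMatrixReader.to_writer()
    for key in ("general.type", "imatrix.chunk_count", "imatrix.chunk_size", "imatrix.dataset"):
        field = reader.get_field(key)
        if field is not None:
            # field.parts[field.data[-1]] is the field's raw value as a numpy array
            print(key, field.parts[field.data[-1]])

    # tensors are stored in (name + ".sums", name + ".counts") pairs
    for tensor in reader.tensors:
        print(tensor.name, tensor.shape, tensor.data.dtype)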

examples/imatrix/imatrix.cpp

@@ -5,11 +5,9 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <sstream>
 #include <thread>
 #include <mutex>
 #include <vector>
-#include <fstream>
 #include <unordered_map>
 #include <algorithm>
@@ -22,16 +20,19 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] [--verbosity 1] \\\n"
             "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
-            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...]\n" , argv[0]);
     LOG_TEE("\n");
 }

+static const char * const LLM_KV_IMATRIX_DATASET     = "imatrix.dataset";
+static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
+static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
+
 struct Stats {
-    std::vector<float> values;
-    std::vector<int>   counts;
-    int ncall = 0;
+    std::vector<double>  values;
+    std::vector<int64_t> counts;
 };

 class IMatrixCollector {
@@ -39,13 +40,13 @@ public:
     IMatrixCollector() = default;
     void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix(int ncall = -1) const;
+    void save_imatrix(int32_t n_chunk = -1) const;
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
     gpt_params                             m_params;
     std::mutex                             m_mutex;
-    int                                    m_last_call = 0;
+    int32_t                                m_last_chunk = 0;
     std::vector<float>                     m_src1_data;
     std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
 };
@@ -119,18 +120,24 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         auto & e = m_stats[wname];

-        ++e.ncall;
+        if (e.counts.size() == 1 && n_as > 1) {
+            // broadcast, when loading an old imatrix
+            e.counts.resize(n_as, e.counts[0]);
+        }

         if (e.values.empty()) {
             e.values.resize(src1->ne[0]*n_as, 0);
-            e.counts.resize(src1->ne[0]*n_as, 0);
+            e.counts.resize(n_as, 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
             exit(1); //GGML_ABORT("fatal error");
         }
+        else if (e.counts.size() != (size_t)n_as) {
+            fprintf(stderr, "Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), (int)n_as);
+            exit(1); //GGML_ABORT("fatal error");
+        }
         if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
         }
         // loop over all possible experts, regardless if they are used or not in the batch
         for (int ex = 0; ex < n_as; ++ex) {
@@ -148,23 +155,26 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                     const int64_t i12 = row;
                     const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);

+                    e.counts[ex]++;
+
                     for (int j = 0; j < (int)src1->ne[0]; ++j) {
                         e.values[e_start + j] += x[j]*x[j];
-                        e.counts[e_start + j]++;
-                        if (!std::isfinite(e.values[e_start + j])) {
-                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                        if (!std::isfinite((float)e.values[e_start + j])) {
+                            fprintf(stderr, "%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
                             exit(1);
                         }
                     }
                 }
             }
-            if (e.ncall > m_last_call) {
-                m_last_call = e.ncall;
-                if (m_last_call % m_params.n_out_freq == 0) {
+            const int32_t n_chunk = e.counts[ex] / (m_params.n_ctx / m_params.n_parallel);
+            if (n_chunk > m_last_chunk) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
+                m_last_chunk = n_chunk;
+                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                     save_imatrix();
                 }
-                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                    save_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                    save_imatrix(m_last_chunk);
                 }
             }
         }
@@ -172,34 +182,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         auto & e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
+            e.counts.resize(1, 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ABORT("fatal error");
         }
-        ++e.ncall;
+        else if (e.counts.size() != 1) {
+            fprintf(stderr, "Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), 1);
+            exit(1); //GGML_ABORT("fatal error");
+        }
         if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         }
+        // TODO: higher dimensions
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
             const float * x = data + row * src1->ne[0];
+            e.counts[0]++;
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
                 e.values[j] += x[j]*x[j];
-                e.counts[j]++;
-                if (!std::isfinite(e.values[j])) {
-                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                if (!std::isfinite((float)e.values[j])) {
+                    fprintf(stderr, "%f detected in %s\n", (float)e.values[j], wname.c_str());
                     exit(1);
                 }
             }
         }
-        if (e.ncall > m_last_call) {
-            m_last_call = e.ncall;
-            if (m_last_call % m_params.n_out_freq == 0) {
+        const int32_t n_chunk = e.counts[0] / (m_params.n_ctx / m_params.n_parallel);
+        if (n_chunk > m_last_chunk) {
+            const int32_t chunk_step = n_chunk - m_last_chunk;
+            m_last_chunk = n_chunk;
+            if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                 save_imatrix();
             }
-            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                save_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                save_imatrix(m_last_chunk);
             }
         }
     }
@@ -207,15 +223,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }

-void IMatrixCollector::save_imatrix(int ncall) const {
+void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
     auto fname = m_params.out_file;
     if (fname.empty()) {
-        fname = "imatrix.dat";
+        fname = "imatrix.gguf";
     }

-    if (ncall > 0) {
+    if (n_chunk > 0) {
         fname += ".at_";
-        fname += std::to_string(ncall);
+        fname += std::to_string(n_chunk);
     }

     // avoid writing imatrix entries that do not have full data
@@ -223,6 +239,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     int n_entries = 0;
     std::vector<std::string> to_store;
+    size_t data_size = 0;

     bool is_first = true; // for printing
     for (const auto & kv : m_stats) {
@@ -256,100 +273,132 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         n_entries++;
         to_store.push_back(kv.first);
+        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
+        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
     }

     if (to_store.size() < m_stats.size()) {
         fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
     }

-    std::ofstream out(fname, std::ios::binary);
-    out.write((const char *) &n_entries, sizeof(n_entries));
+    struct ggml_init_params params = {
+        .mem_size   = data_size,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    struct gguf_context * ctx_gguf = gguf_init_empty();
+
+    gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
+    // Write the input filename to later on specify it in quantize
+    gguf_set_val_str(ctx_gguf, LLM_KV_IMATRIX_DATASET, m_params.prompt_file.c_str());
+    // Write the number of chunks the matrix was computed with
+    gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
+    gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
+
     for (const auto & name : to_store) {
         const auto & stat = m_stats.at(name);
-        int len = name.size();
-        out.write((const char *) &len, sizeof(len));
-        out.write(name.c_str(), len);
-        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
-        int nval = stat.values.size();
-        out.write((const char *) &nval, sizeof(nval));
+        const int32_t nval = (int32_t) stat.values.size();
+        const int32_t nmat = (int32_t) stat.counts.size();
         if (nval > 0) {
-            std::vector<float> tmp(nval);
-            for (int i = 0; i < nval; i++) {
-                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
+            struct ggml_tensor * sums   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
+            struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
+            ggml_set_name(sums, (name + ".sums").c_str());
+            ggml_set_name(counts, (name + ".counts").c_str());
+
+            for (int32_t j = 0; j < nval; ++j) {
+                ((float *) sums->data)[j] = (float) stat.values[j];
             }
-            out.write((const char*)tmp.data(), nval*sizeof(float));
+            for (int32_t j = 0; j < nmat; ++j) {
+                ((float *) counts->data)[j] = (float) stat.counts[j];
+            }
+
+            gguf_add_tensor(ctx_gguf, sums);
+            gguf_add_tensor(ctx_gguf, counts);
         }
     }

-    // Write the number of call the matrix was computed with
-    out.write((const char *) &m_last_call, sizeof(m_last_call));
-
-    // Write the input filename at the end of the file to later on specify it in quantize
-    {
-        int len = m_params.prompt_file.size();
-        out.write((const char *) &len, sizeof(len));
-        out.write(m_params.prompt_file.c_str(), len);
-    }
+    gguf_write_to_file(ctx_gguf, fname.c_str(), false);

     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
     }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
 }

-bool IMatrixCollector::load_imatrix(const char * fname) {
-    std::ifstream in(fname, std::ios::binary);
-    if (!in) {
-        printf("%s: failed to open %s\n",__func__, fname);
+bool IMatrixCollector::load_imatrix(const char * file_name) {
+    struct ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false, // the data is needed
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
+    if (!ctx_gguf) {
         return false;
     }
-    int n_entries;
-    in.read((char*)&n_entries, sizeof(n_entries));
-    if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, fname);
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
+    if (n_entries < 2) {
+        fprintf(stderr, "%s: no data in file %s\n", __func__, file_name);
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
         return false;
     }
-    for (int i = 0; i < n_entries; ++i) {
-        int len; in.read((char *)&len, sizeof(len));
-        std::vector<char> name_as_vec(len+1);
-        in.read((char *)name_as_vec.data(), len);
-        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
-            return false;
-        }
-        name_as_vec[len] = 0;
-        std::string name{name_as_vec.data()};
-        auto & e = m_stats[std::move(name)];
-        int ncall;
-        in.read((char*)&ncall, sizeof(ncall));
-        int nval;
-        in.read((char *)&nval, sizeof(nval));
-        if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
-            m_stats = {};
-            return false;
-        }
+
+    const std::string sums_suffix{".sums"};
+    const std::string counts_suffix{".counts"};
+
+    // TODO: allow loading from mis-ordered imatrix files
+    for (int32_t i = 0; i < n_entries - 1; i += 2) {
+        std::string sums_name{gguf_get_tensor_name(ctx_gguf, i + 0)};
+        std::string counts_name{gguf_get_tensor_name(ctx_gguf, i + 1)};
+
+        if (sums_name.size() < sums_suffix.size() ||
+            counts_name.size() < counts_suffix.size() ||
+            !std::equal(sums_name.begin(), sums_name.end() - sums_suffix.size(), counts_name.begin()) ||
+            !std::equal(sums_suffix.rbegin(), sums_suffix.rend(), sums_name.rbegin()) ||
+            !std::equal(counts_suffix.rbegin(), counts_suffix.rend(), counts_name.rbegin())) {
+            fprintf(stderr, "%s: mismatched sums and counts for entry %d\n", __func__, i / 2);
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            return false;
+        }
+
+        struct ggml_tensor * sums   = ggml_get_tensor(ctx, sums_name.c_str());
+        struct ggml_tensor * counts = ggml_get_tensor(ctx, counts_name.c_str());
+        if (!sums || !counts) {
+            fprintf(stderr, "%s: failed reading data for entry %d\n", __func__, i / 2);
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            return false;
+        }
+
+        std::string name = sums_name.substr(0, sums_name.size() - sums_suffix.size());
+        auto & e = m_stats[name];
+
+        int32_t nval = ggml_nelements(sums);
         if (e.values.empty()) {
             e.values.resize(nval, 0);
-            e.counts.resize(nval, 0);
         }
-        std::vector<float> tmp(nval);
-        in.read((char*)tmp.data(), nval*sizeof(float));
-        if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
-            m_stats = {};
-            return false;
+        int32_t ncounts = ggml_nelements(counts);
+        if (e.counts.empty()) {
+            e.counts.resize(ncounts, 0);
+        } else if (e.counts.size() == 1 && ncounts > 1) {
+            // broadcast, when loading an old imatrix
+            e.counts.resize(ncounts, e.counts[0]);
         }

-        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
-        for (int i = 0; i < nval; i++) {
-            e.values[i] += tmp[i];
-            e.counts[i] += ncall;
+        // Recreate the state as expected by save_imatrix()
+        for (int32_t j = 0; j < nval; j++) {
+            e.values[j] += ((const float *) sums->data)[j];
+        }
+        for (int32_t j = 0; j < ncounts; j++) {
+            e.counts[j] += std::lround(((const float *) counts->data)[j]);
         }
-        e.ncall += ncall;
     }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
     return true;
 }
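
The chunk bookkeeping above replaces the old per-tensor ncall counter: e.counts now records how many activation rows (tokens) were accumulated, one entry per expert, and the current chunk number is derived from it as counts / (n_ctx / n_parallel). A worked example of the save-trigger arithmetic, with assumed parameter values rather than anything from this commit:

    # Worked example of the chunk-based save trigger in collect_imatrix.
    # All values below are assumed for illustration.
    n_ctx = 2048
    n_parallel = 4
    chunk_size = n_ctx // n_parallel     # 512 tokens per chunk (saved as imatrix.chunk_size)

    tokens_seen = 5120                   # e.counts[ex] after some batches
    n_chunk = tokens_seen // chunk_size  # 10 completed chunks

    n_out_freq = 10                      # --output-frequency
    last_chunk = 8                       # m_last_chunk before this batch
    chunk_step = n_chunk - last_chunk    # 2 chunks completed at once by this batch

    # integer division makes this true whenever a multiple of n_out_freq
    # was reached or crossed during the last chunk_step chunks
    if (n_chunk % n_out_freq) // chunk_step == 0:
        print(f"save_imatrix() at chunk {n_chunk}")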

examples/quantize/quantize.cpp

@@ -6,8 +6,6 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include <fstream>
-#include <cmath>

 struct quant_option {
     std::string name;
@@ -61,6 +59,11 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS  = "quantize.imatrix.chunks_count";

+// TODO: share with imatrix.cpp
+static const char * const LLM_KV_IMATRIX_DATASET     = "imatrix.dataset";
+static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
+static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
+
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
@@ -121,66 +124,92 @@ static void usage(const char * executable) {
 }

 static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
-    if (!in) {
-        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
+    struct ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false, // the data is needed
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
         exit(1);
     }
-    int n_entries;
-    in.read((char *)&n_entries, sizeof(n_entries));
-    if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
+    if (n_entries < 2) {
+        fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
         exit(1);
     }
-    for (int i = 0; i < n_entries; ++i) {
-        int len; in.read((char *)&len, sizeof(len));
-        std::vector<char> name_as_vec(len+1);
-        in.read((char *)name_as_vec.data(), len);
-        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+
+    const int dataset_idx     = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASET);
+    const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
+    const int chunk_size_idx  = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
+    if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) {
+        fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str());
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
+        exit(1);
+    }
+
+    const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
+
+    const std::string sums_suffix{".sums"};
+    const std::string counts_suffix{".counts"};
+
+    // TODO: allow loading from mis-ordered imatrix files
+    for (int32_t i = 0; i < n_entries - 1; i += 2) {
+        std::string sums_name{gguf_get_tensor_name(ctx_gguf, i + 0)};
+        std::string counts_name{gguf_get_tensor_name(ctx_gguf, i + 1)};
+
+        if (sums_name.size() < sums_suffix.size() ||
+            counts_name.size() < counts_suffix.size() ||
+            !std::equal(sums_name.begin(), sums_name.end() - sums_suffix.size(), counts_name.begin()) ||
+            !std::equal(sums_suffix.rbegin(), sums_suffix.rend(), sums_name.rbegin()) ||
+            !std::equal(counts_suffix.rbegin(), counts_suffix.rend(), counts_name.rbegin())) {
+            fprintf(stderr, "%s: mismatched sums and counts for entry %d\n", __func__, i / 2);
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             exit(1);
         }
-        name_as_vec[len] = 0;
-        std::string name{name_as_vec.data()};
+
+        struct ggml_tensor * sums   = ggml_get_tensor(ctx, sums_name.c_str());
+        struct ggml_tensor * counts = ggml_get_tensor(ctx, counts_name.c_str());
+        if (!sums || !counts) {
+            fprintf(stderr, "%s: failed reading data for entry %d\n", __func__, i / 2);
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            exit(1);
+        }
+
+        const int64_t ne0 = sums->ne[0];
+        const int64_t ne1 = sums->ne[1];
+
+        std::string name = sums_name.substr(0, sums_name.size() - sums_suffix.size());
         auto & e = imatrix_data[name];
-        int ncall;
-        in.read((char *)&ncall, sizeof(ncall));
-        int nval;
-        in.read((char *)&nval, sizeof(nval));
-        if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n", __func__, i);
-            imatrix_data = {};
-            exit(1);
-        }
-        e.resize(nval);
-        in.read((char *)e.data(), nval*sizeof(float));
-        if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n", __func__, i);
-            imatrix_data = {};
-            exit(1);
-        }
-        if (ncall > 0) {
-            for (auto& v : e) v /= ncall;
-        }
+        e.resize(ggml_nelements(sums));
+        float max_count = 0.0f;
+        for (int64_t j = 0; j < ne1; ++j) {
+            const float count = ((const float *) counts->data)[j];
+            for (int64_t i = 0; i < ne0; ++i) {
+                e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
+            }
+            if (count > max_count) {
+                max_count = count;
+            }
+        }

         if (getenv("LLAMA_TRACE")) {
-            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), int(max_count / chunk_size), name.c_str());
         }
     }

-    // latest imatrix version contains the dataset filename at the end of the file
-    int m_last_call = 0;
-    if (in.peek() != EOF) {
-        in.read((char *)&m_last_call, sizeof(m_last_call));
-        int dataset_len;
-        in.read((char *)&dataset_len, sizeof(dataset_len));
-        std::vector<char> dataset_as_vec(dataset_len);
-        in.read(dataset_as_vec.data(), dataset_len);
-        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
-        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
-    }
-    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
-    return m_last_call;
+    int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx);
+    imatrix_dataset = gguf_get_val_str(ctx_gguf, dataset_idx);
+
+    printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
+    return m_last_chunk;
 }

 static int prepare_imatrix(const std::string & imatrix_file,
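
Because the new format stores raw sums and counts rather than pre-divided averages, quantize recovers the per-channel means by dividing each row of a .sums tensor by the matching .counts entry, as load_imatrix does above. A rough numpy equivalent, with assumed shapes and values:

    # Rough numpy equivalent of the averaging done in quantize's load_imatrix.
    # Shapes and values are assumed for illustration.
    import numpy as np

    n_expert, n_embd = 2, 4
    sums = np.arange(n_expert * n_embd, dtype=np.float32).reshape(n_expert, n_embd)
    counts = np.array([[512.0], [1024.0]], dtype=np.float32)  # tokens seen per expert

    e = sums / counts                          # one count per row, like e[j*ne0 + i] / count
    chunk_size = 512                           # from the imatrix.chunk_size metadata
    n_chunks = int(counts.max() / chunk_size)  # reported as "ncall" in the LLAMA_TRACE log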

gguf-py/gguf/constants.py

@@ -167,6 +167,12 @@ class Keys:
         TYPE       = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"

+    class IMatrix:
+        CHUNK_COUNT = "imatrix.chunk_count"
+        CHUNK_SIZE  = "imatrix.chunk_size"
+        DATASET     = "imatrix.dataset"
+

 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -175,6 +181,7 @@ class Keys:
 class GGUFType:
     MODEL   = "model"
     ADAPTER = "adapter"
+    IMATRIX = "imatrix"


 class MODEL_ARCH(IntEnum):