Merge branch 'master' into compilade/imatrix-batched-chunks

common : use GGUF for imatrix output by default
2025-02-09 12:06:24 -05:00 · 2025-02-09 12:06:15 -05:00 · 2025-01-30 19:56:20 -05:00 · 2024-09-10 12:09:17 -04:00 · 2024-09-10 11:51:23 -04:00 · 2024-09-09 22:35:47 -04:00
12 changed files with 514 additions and 248 deletions
--- a/common/chat-template.hpp
+++ b/common/chat-template.hpp
@ -249,30 +249,16 @@ class chat_template {
                    inputs.add_generation_prompt = false;
                    full = apply(inputs);
                }
-                auto eos_pos_last = full.rfind(eos_token_);
-                if (eos_pos_last == prefix.size() - eos_token_.size() ||
-                      (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
-                    full = full.substr(0, eos_pos_last);
-                }
-                size_t common_prefix_length = 0;
-                for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
-                    if (prefix[i] != full[i]) {
-                        break;
+
+                if (full.find(prefix) != 0) {
+                    if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
+                        prefix = prefix.substr(0, prefix.size() - eos_token_.size());
                    }
-                    if (prefix[i] == '<') {
-                        // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
-                        // but it removes thinking tags for past messages.
-                        // The prefix and full strings diverge at <think> vs. <｜tool▁calls▁begin｜>, we avoid consuming the leading <.
-                        continue;
-                    }
-                    common_prefix_length = i + 1;
                }
-                auto example = full.substr(common_prefix_length);
-                if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
+                if (full.find(prefix) != 0) {
                    fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
-                } else {
-                    tool_call_example_ = example;
                }
+                tool_call_example_ = full.substr(prefix.size());
            }
        } catch (const std::exception & e) {
            fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@ -377,7 +363,7 @@ class chat_template {
            if (polyfill_tools) {
                adjusted_messages = add_system(inputs.messages,
                    "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                    (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
+                    (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
            } else {
                adjusted_messages = inputs.messages;
            }
--- a/common/common.h
+++ b/common/common.h
@ -383,7 +383,7 @@ struct common_params {
    int32_t i_pos  = -1;  // position of the passkey in the junk text

    // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+    std::string out_file = "imatrix.gguf"; // save the resulting imatrix to this file

    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
--- a/common/minja.hpp
+++ b/common/minja.hpp
@ -1385,13 +1385,6 @@ static std::string strip(const std::string & s) {
  return s.substr(start, end - start + 1);
 }

-static std::string capitalize(const std::string & s) {
-  if (s.empty()) return s;
-  auto result = s;
-  result[0] = std::toupper(result[0]);
-  return result;
-}
-
 static std::string html_escape(const std::string & s) {
  std::string result;
  result.reserve(s.size());
@ -1469,9 +1462,6 @@ public:
          if (method->get_name() == "strip") {
            vargs.expectArgs("strip method", {0, 0}, {0, 0});
            return Value(strip(str));
-          } else if (method->get_name() == "capitalize") {
-            vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
-            return Value(capitalize(str));
          } else if (method->get_name() == "endswith") {
            vargs.expectArgs("endswith method", {1, 1}, {0, 0});
            auto suffix = vargs.args[0].get<std::string>();
@ -1802,7 +1792,7 @@ private:
        auto left = parseStringConcat();
        if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");

-        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
+        static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
        static std::regex not_tok(R"(not\b)");
        std::string op_str;
        while (!(op_str = consumeToken(compare_tok)).empty()) {
@ -2181,7 +2171,7 @@ private:
    using TemplateTokenIterator = TemplateTokenVector::const_iterator;

    std::vector<std::string> parseVarNames() {
-      static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
+      static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");

      std::vector<std::string> group;
      if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@ -2204,13 +2194,13 @@ private:
    }

    TemplateTokenVector tokenize() {
-      static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
+      static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
      static std::regex expr_open_regex(R"(\{\{([-~])?)");
-      static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
+      static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
      static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
      static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
-      static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
-      static std::regex block_close_regex(R"(\s*([-~])?%\})");
+      static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
+      static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");

      TemplateTokenVector tokens;
      std::vector<std::string> group;
@ -2294,7 +2284,7 @@ private:
              auto post_space = parseBlockClose();
              tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
            } else if (keyword == "set") {
-              static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
+              static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");

              std::string ns;
              std::vector<std::string> var_names;
@ -2346,11 +2336,6 @@ private:
              throw std::runtime_error("Unexpected block: " + keyword);
            }
          } else if (std::regex_search(it, end, match, non_text_open_regex)) {
-            if (!match.position()) {
-                if (match[0] != "{#")
-                    throw std::runtime_error("Internal error: Expected a comment");
-                throw std::runtime_error("Missing end of comment tag");
-            }
            auto text_end = it + match.position();
            text = std::string(it, text_end);
            it = text_end;
@ -2415,7 +2400,7 @@ private:

              auto text = text_token->text;
              if (post_space == SpaceHandling::Strip) {
-                static std::regex trailing_space_regex(R"(\s+$)");
+                static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
                text = std::regex_replace(text, trailing_space_regex, "");
              } else if (options.lstrip_blocks && it != end) {
                auto i = text.size();
@ -2425,7 +2410,7 @@ private:
                }
              }
              if (pre_space == SpaceHandling::Strip) {
-                static std::regex leading_space_regex(R"(^\s+)");
+                static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
                text = std::regex_replace(text, leading_space_regex, "");
              } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
                if (text.length() > 0 && text[0] == '\n') {
--- a/convert_legacy_imatrix_to_gguf.py
+++ b/convert_legacy_imatrix_to_gguf.py
@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import os
+import sys
+import logging
+import argparse
+
+from typing import Any
+from pathlib import Path
+from dataclasses import dataclass
+
+import numpy as np
+import numpy.typing as npt
+
+# Unless the NO_LOCAL_GGUF environment variable is set, put the gguf-py directory
+# located next to this script on sys.path (at index 1) before importing gguf.
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+
+# Module-level logger; its level is configured in the __main__ section via logging.basicConfig().
+logger = logging.getLogger("imatrix-to-gguf")
+
+
+class IMatrixWriter(gguf.GGUFWriter):
+    def add_architecture(self) -> None:
+        # no arch is stored in imatrix files
+        pass
+
+
+@dataclass
+class IMatrixEntry:
+    """One named entry of a legacy imatrix file, as rebuilt by IMatrixReader."""
+
+    # float32 data read for the entry, multiplied by IMatrixReader.chunk_size;
+    # IMatrixReader.to_writer() stores it as the "<name>.sums" tensor
+    values: np.ndarray[Any, np.dtype[np.float32]]
+    # single-element float32 array holding ncall * IMatrixReader.chunk_size;
+    # IMatrixReader.to_writer() stores it as the "<name>.counts" tensor
+    counts: np.ndarray[Any, np.dtype[np.float32]]
+
+
+class IMatrixReader:
+    chunk_size: int = 512  # guess
+    offset: int = 0
+    data: np.ndarray[Any, np.dtype[np.uint8]]
+    n_enties: int
+    entries: dict[str, IMatrixEntry]
+    chunk_count: int
+    dataset: str
+
+    def _get(self, dtype: npt.DTypeLike, count: int = 1) -> npt.NDArray[Any]:
+        count = int(count)
+        itemsize = int(np.empty([], dtype=dtype).itemsize)
+        offset = self.offset
+        self.offset = offset + itemsize * count
+        return self.data[offset:self.offset].view(dtype=dtype)[:count]
+
+    def __init__(self, imatrix: Path):
+        self.offset = 0
+        self.entries = {}
+        self.data = np.memmap(imatrix)
+        n_entries = self._get(np.int32).item()
+        assert n_entries >= 0
+        for _ in range(n_entries):
+            len = self._get(np.int32).item()
+            name = self._get(np.uint8, len).tobytes().decode("utf-8")
+            ncall = self._get(np.int32).item()
+            nval = self._get(np.int32).item()
+            data = self._get(np.float32, nval)
+            assert name not in self.entries, f"duplicated name: {name!r}"
+
+            self.entries[name] = IMatrixEntry(data * np.float32(self.chunk_size), np.array([ncall * self.chunk_size], dtype=np.float32))
+
+        self.chunk_count = self._get(np.int32).item()
+        dataset_len = self._get(np.int32).item()
+        self.dataset = self._get(np.uint8, dataset_len).tobytes().decode("utf-8")
+
+    def to_writer(self, outfile: Path) -> IMatrixWriter:
+        writer = IMatrixWriter(path=outfile, arch="")
+
+        writer.add_type(gguf.GGUFType.IMATRIX)
+        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_COUNT, self.chunk_count, gguf.GGUFValueType.UINT32)
+        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_SIZE, self.chunk_size, gguf.GGUFValueType.UINT32)
+        writer.add_key_value(gguf.Keys.IMatrix.DATASET, self.dataset, gguf.GGUFValueType.STRING)
+
+        for name, entry in self.entries.items():
+            writer.add_tensor(name + ".sums", entry.values)
+            writer.add_tensor(name + ".counts", entry.counts)
+
+        return writer
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Convert an old imatrix.dat file to a GGUF compatible file")
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input.",
+    )
+    parser.add_argument(
+        "--verbose", action="store_true",
+        help="increase output verbosity",
+    )
+    parser.add_argument(
+        "imatrix", type=Path,
+        help="path to an imatrix file",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    if args.outfile is None:
+        input_file: Path = args.imatrix
+        if input_file.suffix != ".gguf":
+            args.outfile = input_file.with_suffix(".gguf")
+            if args.outfile.exists():
+                logger.error(f"default file exists, specify with --outfile to overwrite: {args.outfile}")
+                exit(1)
+
+    writer = IMatrixReader(args.imatrix).to_writer(args.outfile)
+
+    writer.write_header_to_file(args.outfile)
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file()
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "gguf.h"

 #include <cmath>
 #include <cstdio>
@ -10,8 +11,8 @@
 #include <thread>
 #include <mutex>
 #include <vector>
-#include <fstream>
 #include <unordered_map>
+#include <map>
 #include <algorithm>

 #if defined(_MSC_VER)
@ -21,16 +22,27 @@
 static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \\\n"
            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
-            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...]\n" , argv[0]);
    LOG("\n");
 }

+// Strips `suffix` from the end of `str` in place.
+// Returns true and shortens `str` when `str` ends with `suffix`;
+// otherwise returns false and leaves `str` unchanged.
+static bool str_remove_suffix(std::string & str, const std::string & suffix) {
+    if (str.size() < suffix.size()) {
+        return false;
+    }
+    const size_t base_len = str.size() - suffix.size();
+    // compare exactly the trailing suffix.size() characters against the suffix
+    if (str.compare(base_len, suffix.size(), suffix) != 0) {
+        return false;
+    }
+    // truncate in place instead of building a temporary with substr()
+    str.resize(base_len);
+    return true;
+}
+
+// GGUF metadata keys written by IMatrixCollector::save_imatrix():
+//   dataset     - string, set from m_params.prompt_file
+//   chunk_count - u32, set from m_last_chunk
+//   chunk_size  - u32, set from m_params.n_ctx / m_params.n_parallel
+static const char * const LLM_KV_IMATRIX_DATASET     = "imatrix.dataset";
+static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
+static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
+
 struct Stats {
-    std::vector<float> values;
-    std::vector<int> counts;
-    int ncall = 0;
+    std::vector<float>   values;
+    std::vector<int64_t> counts;
 };

 class IMatrixCollector {
@ -38,13 +50,13 @@ public:
    IMatrixCollector() = default;
    void set_params(common_params params) { m_params = std::move(params); }
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix(int ncall = -1) const;
-    bool load_imatrix(const char * fname);
+    void save_imatrix(int32_t n_chunk = -1) const;
+    bool load_imatrix(const char * file_name);
 private:
    std::unordered_map<std::string, Stats> m_stats;
    common_params                          m_params;
    std::mutex                             m_mutex;
-    int                                    m_last_call = 0;
+    int32_t                                m_last_chunk = 0;
    std::vector<float>                     m_src1_data;
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
 };
@ -118,17 +130,23 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

        auto & e = m_stats[wname];

-        ++e.ncall;
-
+        if (e.counts.size() == 1 && n_as > 1) {
+            // broadcast, when loading an old imatrix
+            e.counts.resize(n_as, e.counts[0]);
+        }
        if (e.values.empty()) {
            e.values.resize(src1->ne[0]*n_as, 0);
-            e.counts.resize(src1->ne[0]*n_as, 0);
+            e.counts.resize(n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        else if (e.counts.size() != (size_t)n_as) {
+            LOG_ERR("Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), (int)n_as);
+            exit(1); //GGML_ABORT("fatal error");
+        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];
@ -145,24 +163,26 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                    const int64_t i12 = row;
                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);

+                    e.counts[ex]++;
+
                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                        e.values[e_start + j] += x[j]*x[j];
-                        e.counts[e_start + j]++;
-                        if (!std::isfinite(e.values[e_start + j])) {
-                            LOG("\n");
-                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                        e.values[e_start + j] = std::fma(x[j], x[j], e.values[e_start + j]);
+                        if (!std::isfinite((float)e.values[e_start + j])) {
+                            LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
-            if (e.ncall > m_last_call) {
-                m_last_call = e.ncall;
-                if (m_last_call % m_params.n_out_freq == 0) {
+            const int32_t n_chunk = e.counts[ex] / (m_params.n_ctx / m_params.n_parallel);
+            if (n_chunk > m_last_chunk) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
+                m_last_chunk = n_chunk;
+                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                    save_imatrix();
                }
-                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                    save_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                    save_imatrix(m_last_chunk);
                }
            }
        }
@ -170,32 +190,38 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        auto & e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
+            e.counts.resize(1, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1); //GGML_ABORT("fatal error");
        }
-        ++e.ncall;
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        else if (e.counts.size() != 1) {
+            LOG_ERR("Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), 1);
+            exit(1); //GGML_ABORT("fatal error");
+        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        // TODO: higher dimensions
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
+            e.counts[0]++;
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                e.values[j] += x[j]*x[j];
-                e.counts[j]++;
-                if (!std::isfinite(e.values[j])) {
-                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
+                e.values[j] = std::fma(x[j], x[j], e.values[j]);
+                if (!std::isfinite((float)e.values[j])) {
+                    LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
                    exit(1);
                }
            }
        }
-        if (e.ncall > m_last_call) {
-            m_last_call = e.ncall;
-            if (m_last_call % m_params.n_out_freq == 0) {
+        const int32_t n_chunk = e.counts[0] / (m_params.n_ctx / m_params.n_parallel);
+        if (n_chunk > m_last_chunk) {
+            const int32_t chunk_step = n_chunk - m_last_chunk;
+            m_last_chunk = n_chunk;
+            if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                save_imatrix();
            }
-            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                save_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                save_imatrix(m_last_chunk);
            }
        }
    }
@ -203,22 +229,22 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    return true;
 }

-void IMatrixCollector::save_imatrix(int ncall) const {
+void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
    auto fname = m_params.out_file;
    if (fname.empty()) {
-        fname = "imatrix.dat";
+        fname = "imatrix.gguf";
    }

-    if (ncall > 0) {
+    if (n_chunk > 0) {
        fname += ".at_";
-        fname += std::to_string(ncall);
+        fname += std::to_string(n_chunk);
    }

    // avoid writing imatrix entries that do not have full data
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

-    int n_entries = 0;
    std::vector<std::string> to_store;
+    size_t data_size = 0;

    bool is_first = true; // for printing
    for (const auto & kv : m_stats) {
@ -250,101 +276,157 @@ void IMatrixCollector::save_imatrix(int ncall) const {
            continue;
        }

-        n_entries++;
        to_store.push_back(kv.first);
+        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
+        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
    }

    if (to_store.size() < m_stats.size()) {
        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }

-    std::ofstream out(fname, std::ios::binary);
-    out.write((const char *) &n_entries, sizeof(n_entries));
+    // deterministic tensor name order
+    std::sort(to_store.begin(), to_store.end());
+
+    struct ggml_init_params params = {
+        /* .mem_size   = */ data_size,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+    struct gguf_context * ctx_gguf = gguf_init_empty();
+
+    gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
+    // Write the input filename to later on specify it in quantize
+    gguf_set_val_str(ctx_gguf, LLM_KV_IMATRIX_DATASET, m_params.prompt_file.c_str());
+    // Write the number of chunks the matrix was computed with
+    gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
+    gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
+
    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
-        int len = name.size();
-        out.write((const char *) &len, sizeof(len));
-        out.write(name.c_str(), len);
-        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
-        int nval = stat.values.size();
-        out.write((const char *) &nval, sizeof(nval));
+        const int32_t nval = (int32_t) stat.values.size();
+        const int32_t nmat = (int32_t) stat.counts.size();
        if (nval > 0) {
-            std::vector<float> tmp(nval);
-            for (int i = 0; i < nval; i++) {
-                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
+            struct ggml_tensor * sums = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
+            struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
+            ggml_format_name(sums, "%s.sums", name.c_str());
+            ggml_format_name(counts, "%s.counts", name.c_str());
+
+            for (int32_t j = 0; j < nval; ++j) {
+                ((float *) sums->data)[j] = (float) stat.values[j];
            }
-            out.write((const char*)tmp.data(), nval*sizeof(float));
+            for (int32_t j = 0; j < nmat; ++j) {
+                ((float *) counts->data)[j] = (float) stat.counts[j];
+            }
+
+            gguf_add_tensor(ctx_gguf, sums);
+            gguf_add_tensor(ctx_gguf, counts);
        }
    }

-    // Write the number of call the matrix was computed with
-    out.write((const char *) &m_last_call, sizeof(m_last_call));
-
-    // Write the input filename at the end of the file to later on specify it in quantize
-    {
-        int len = m_params.prompt_file.size();
-        out.write((const char *) &len, sizeof(len));
-        out.write(m_params.prompt_file.c_str(), len);
-    }
+    gguf_write_to_file(ctx_gguf, fname.c_str(), false);

    LOGV(1, "\n");
-    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
 }

-bool IMatrixCollector::load_imatrix(const char * fname) {
-    std::ifstream in(fname, std::ios::binary);
-    if (!in) {
-        LOG_ERR("%s: failed to open %s\n",__func__, fname);
+bool IMatrixCollector::load_imatrix(const char * file_name) {
+    struct ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false, // the data is needed
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
+    if (!ctx_gguf) {
        return false;
    }
-    int n_entries;
-    in.read((char*)&n_entries, sizeof(n_entries));
-    if (in.fail() || n_entries < 1) {
-        LOG_ERR("%s: no data in file %s\n", __func__, fname);
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
+    if (n_entries < 1) {
+        LOG_ERR("%s: no data in file %s\n", __func__, file_name);
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
        return false;
    }
-    for (int i = 0; i < n_entries; ++i) {
-        int len; in.read((char *)&len, sizeof(len));
-        std::vector<char> name_as_vec(len+1);
-        in.read((char *)name_as_vec.data(), len);
-        if (in.fail()) {
-            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+
+    const std::string sums_suffix{".sums"};
+    const std::string counts_suffix{".counts"};
+
+    // Could re-use m_stats instead, but this allows
+    // checking for completeness of *each* loaded imatrix file
+    // and also makes it easier to re-use a similar implementation in quantize.cpp
+    // Using an ordered map to get a deterministic iteration order.
+    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+
+    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string name = cur->name;
+
+        if (name.empty()) { continue; }
+
+        if (str_remove_suffix(name, sums_suffix)) {
+            // sums
+            sums_counts_for[name].first = cur;
+        } else if (str_remove_suffix(name, counts_suffix)) {
+            // counts
+            sums_counts_for[name].second = cur;
+        } else {
+            LOG_ERR("%s: invalid imatrix tensor name: %s\n", __func__, name.c_str());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
            return false;
        }
-        name_as_vec[len] = 0;
-        std::string name{name_as_vec.data()};
-        auto & e = m_stats[std::move(name)];
-        int ncall;
-        in.read((char*)&ncall, sizeof(ncall));
-        int nval;
-        in.read((char *)&nval, sizeof(nval));
-        if (in.fail() || nval < 1) {
-            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
-            m_stats = {};
+    }
+
+    for (const auto & sc : sums_counts_for) {
+        const        std::string & name   = sc.first;
+        const struct ggml_tensor * sums   = sc.second.first;
+        const struct ggml_tensor * counts = sc.second.second;
+
+        if (!sums || !counts) {
+            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
            return false;
        }

+        auto & e = m_stats[name];
+
+        int64_t nval = ggml_nelements(sums);
        if (e.values.empty()) {
            e.values.resize(nval, 0);
-            e.counts.resize(nval, 0);
-        }
-
-        std::vector<float> tmp(nval);
-        in.read((char*)tmp.data(), nval*sizeof(float));
-        if (in.fail()) {
-            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
-            m_stats = {};
+        } else if ((size_t) nval != e.values.size()) {
+            LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
            return false;
        }

-        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
-        for (int i = 0; i < nval; i++) {
-            e.values[i] += tmp[i];
-            e.counts[i] += ncall;
+        int64_t ncounts = ggml_nelements(counts);
+        if (e.counts.empty()) {
+            e.counts.resize(ncounts, 0);
+        } else if (e.counts.size() == 1 && ncounts > 1) {
+            // broadcast, when loading an old imatrix
+            e.counts.resize(ncounts, e.counts[0]);
+        } else if ((size_t) ncounts != e.counts.size()) {
+            LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            return false;
        }
-        e.ncall += ncall;

+        // Recreate the state as expected by save_imatrix()
+        for (int64_t j = 0; j < nval; j++) {
+            e.values[j] += ((const float *) sums->data)[j];
+        }
+        for (int64_t j = 0; j < ncounts; j++) {
+            e.counts[j] += std::lround(((const float *) counts->data)[j]);
+        }
    }
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
    return true;
 }

@ -427,12 +509,11 @@ static void process_logits(
    }
 }

-static bool compute_imatrix(llama_context * ctx, const common_params & params) {
+static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab);
-    const int n_ctx = llama_n_ctx(ctx);

    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));

@ -477,45 +558,61 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
    double nll = 0.0;
    double nll2 = 0.0;

-    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
-
-    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
-
    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+    const int n_seq = std::max(1, n_batch / n_ctx);
+
+    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
+    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
+
+    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);

    std::vector<float> logits;
    if (params.compute_ppl && num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }

-    for (int i = 0; i < n_chunk; ++i) {
+    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+
+    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
+
+    for (int i = 0; i < n_chunk; i += n_seq) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

-        std::vector<float> logits;
+        const int n_seq_batch = std::min(n_seq, n_chunk - i);

        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
        llama_kv_cache_clear(ctx);

-        llama_batch batch = llama_batch_init(n_batch, 0, 1);
-
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
-
-            // add BOS token for the first batch of each chunk
-            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_vocab_bos(vocab);
-            }
-
+            // clear the batch
            common_batch_clear(batch);
-            for (int i = 0; i < batch_size; i++) {
-                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+
+            for (int seq = 0; seq < n_seq_batch; seq++) {
+                int seq_start = batch_start + seq*n_ctx;
+
+                // save original token and restore it after eval
+                const auto token_org = tokens[seq_start];
+
+                // add BOS token for the first batch of each chunk
+                if (add_bos && j == 0) {
+                    tokens[seq_start] = llama_vocab_bos(vocab);
+                }
+                for (int k = 0; k < batch_size; ++k) {
+                    // NOTE: specifying all logits to get activations for the output.weight tensor
+                    //       and also for the perplexity calculation.
+                    // TODO: only get outputs when (params.process_output || params.compute_ppl)
+                    //       (not possible when this skips FFN computation of the last layer)
+                    common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
+                }
+  
+                // restore the original token in case it was set to BOS
+                tokens[seq_start] = token_org;
            }

            if (llama_decode(ctx, batch)) {
@ -524,23 +621,19 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
                return false;
            }

-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
            if (params.compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }

-        llama_batch_free(batch);
-
-        const auto t_end = std::chrono::high_resolution_clock::now();

        if (i == 0) {
+            llama_synchronize(ctx);
+            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
+            int total_seconds = (int)(t_total * n_chunk / n_seq);
            if (total_seconds >= 60*60) {
                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
@ -550,17 +643,27 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {

        if (params.compute_ppl) {
            const int first = n_ctx/2;
-            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-            count += n_ctx - first - 1;
+            for (int seq = 0; seq < n_seq_batch; seq++) {
+                const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);

-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+                llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
+
+                process_logits(n_vocab, all_logits + first*n_vocab,
+                        tokens_data, n_ctx - 1 - first,
+                        workers, nll, nll2,
+                        logit_history.data() + start + seq*n_ctx + first,
+                        prob_history.data()  + start + seq*n_ctx + first);
+
+                count += n_ctx - first - 1;
+
+                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+            }
            fflush(stdout);

            logits.clear();
        }
    }
+
    LOG("\n");

    if (params.compute_ppl) {
@ -576,6 +679,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
        }
    }

+    llama_batch_free(batch);
+
    return true;
 }

@ -592,7 +697,22 @@ int main(int argc, char ** argv) {

    common_init();

-    params.n_batch = std::min(params.n_batch, params.n_ctx);
+    const int32_t n_ctx = params.n_ctx;
+
+    if (n_ctx <= 0) {
+        LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
+        return 1;
+    }
+
+    {
+        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
+        const int32_t n_kv = n_seq * n_ctx;
+
+        params.n_parallel = n_seq;
+        params.n_ctx      = n_kv;
+
+        params.n_batch = std::min(params.n_batch, n_kv);
+    }

    g_collector.set_params(params);

@ -648,7 +768,7 @@ int main(int argc, char ** argv) {
        }
        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
    } else {
-        if (!compute_imatrix(ctx, params)) {
+        if (!compute_imatrix(ctx, params, n_ctx)) {
            return 1;
        }
    }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -37,7 +37,7 @@ Once downloaded, place your model in the models folder in llama.cpp.

 ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
 ```bash
-./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
+./llama-cli -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 ```

 ### Windows:
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -1,13 +1,13 @@
 #include "common.h"
 #include "llama.h"
+#include "gguf.h"

 #include <cstdio>
 #include <cstring>
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include <fstream>
-#include <cmath>
+#include <map>

 struct quant_option {
    std::string name;
@ -60,6 +60,11 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";

+// TODO: share with imatrix.cpp
+static const char * const LLM_KV_IMATRIX_DATASET     = "imatrix.dataset";
+static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
+static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
+
 static bool striequals(const char * a, const char * b) {
    while (*a && *b) {
        if (std::tolower(*a) != std::tolower(*b)) {
@ -129,67 +134,114 @@ static void usage(const char * executable) {
    exit(1);
 }

+// TODO: share with implementation in imatrix.cpp
+static bool str_remove_suffix(std::string & str, const std::string & suffix) {
+    bool has_suffix = str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), str.size(), suffix) == 0;
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
-    if (!in) {
-        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
+
+    struct ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false, // the data is needed
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: if this is an older imatrix file, make sure to convert it to the GGUF-based imatrix format\n", __func__);
        exit(1);
    }
-    int n_entries;
-    in.read((char *)&n_entries, sizeof(n_entries));
-    if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
+    if (n_entries < 1) {
+        fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
        exit(1);
    }
-    for (int i = 0; i < n_entries; ++i) {
-        int len; in.read((char *)&len, sizeof(len));
-        std::vector<char> name_as_vec(len+1);
-        in.read((char *)name_as_vec.data(), len);
-        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+
+    const int dataset_idx     = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASET);
+    const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
+    const int chunk_size_idx  = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
+    if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) {
+        fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str());
+        gguf_free(ctx_gguf);
+        ggml_free(ctx);
+        exit(1);
+    }
+
+    const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
+
+    const std::string sums_suffix{".sums"};
+    const std::string counts_suffix{".counts"};
+
+    // Using an ordered map to get a deterministic iteration order.
+    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+
+    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string name = cur->name;
+
+        if (name.empty()) { continue; }
+
+        if (str_remove_suffix(name, sums_suffix)) {
+            // sums
+            sums_counts_for[name].first = cur;
+        } else if (str_remove_suffix(name, counts_suffix)) {
+            // counts
+            sums_counts_for[name].second = cur;
+        } else {
+            fprintf(stderr, "%s: invalid imatrix tensor name: %s\n", __func__, name.c_str());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
            exit(1);
        }
-        name_as_vec[len] = 0;
-        std::string name{name_as_vec.data()};
+    }
+
+    for (const auto & sc : sums_counts_for) {
+        const        std::string & name   = sc.first;
+        const struct ggml_tensor * sums   = sc.second.first;
+        const struct ggml_tensor * counts = sc.second.second;
+
+        if (!sums || !counts) {
+            fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            exit(1);
+        }
+
+        const int64_t ne0 = sums->ne[0];
+        const int64_t ne1 = sums->ne[1];
+
        auto & e = imatrix_data[name];
-        int ncall;
-        in.read((char *)&ncall, sizeof(ncall));
-        int nval;
-        in.read((char *)&nval, sizeof(nval));
-        if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n", __func__, i);
-            imatrix_data = {};
-            exit(1);
+        e.resize(ggml_nelements(sums));
+        float max_count = 0.0f;
+        for (int64_t j = 0; j < ne1; ++j) {
+            const float count = ((const float *) counts->data)[j];
+            for (int64_t i = 0; i < ne0; ++i) {
+                e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
+            }
+            if (count > max_count) {
+                max_count = count;
+            }
        }
-        e.resize(nval);
-        in.read((char *)e.data(), nval*sizeof(float));
-        if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n", __func__, i);
-            imatrix_data = {};
-            exit(1);
-        }
-        if (ncall > 0) {
-            for (auto& v : e) v /= ncall;
-        }
-
        if (getenv("LLAMA_TRACE")) {
-            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+            printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
        }
    }

-    // latest imatrix version contains the dataset filename at the end of the file
-    int m_last_call = 0;
-    if (in.peek() != EOF) {
-        in.read((char *)&m_last_call, sizeof(m_last_call));
-        int dataset_len;
-        in.read((char *)&dataset_len, sizeof(dataset_len));
-        std::vector<char> dataset_as_vec(dataset_len);
-        in.read(dataset_as_vec.data(), dataset_len);
-        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
-        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
-    }
-    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
-    return m_last_call;
+    int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx);
+    imatrix_dataset = gguf_get_val_str(ctx_gguf, dataset_idx);
+
+    printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
+    return m_last_chunk;
 }

 static int prepare_imatrix(const std::string & imatrix_file,
--- a/ggml/include/ggml-vulkan.h
+++ b/ggml/include/ggml-vulkan.h
@ -10,6 +10,8 @@ extern "C" {
 #define GGML_VK_NAME "Vulkan"
 #define GGML_VK_MAX_DEVICES 16

+GGML_BACKEND_API void ggml_vk_instance_init(void);
+
 // backend API
 GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -167,7 +167,6 @@ struct vk_device_struct {
    uint32_t subgroup_size;
    uint32_t shader_core_count;
    bool uma;
-    bool prefer_host_memory;
    bool float_controls_rte_fp16;

    bool subgroup_size_control;
@ -1295,9 +1294,7 @@ static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk:
 static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
    vk_buffer buf;
    try {
-        if (device->prefer_host_memory) {
-            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
-        } else if (device->uma) {
+        if (device->uma) {
            // Fall back to host memory type
            buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
        } else {
@ -2202,9 +2199,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
        device->physical_device = physical_devices[dev_num];
        const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();

-        const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
-        device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
-
        bool fp16_storage = false;
        bool fp16_compute = false;
        bool maintenance4_support = false;
@ -2793,12 +2787,14 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
 static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);

-static void ggml_vk_instance_init() {
+void ggml_vk_instance_init() {
    if (vk_instance_initialized) {
        return;
    }
    VK_LOG_DEBUG("ggml_vk_instance_init()");

+    vk_instance_initialized = true;
+
    uint32_t api_version = vk::enumerateInstanceVersion();

    if (api_version < VK_API_VERSION_1_2) {
@ -2849,7 +2845,6 @@ static void ggml_vk_instance_init() {
        GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
    }
    vk_instance.instance = vk::createInstance(instance_create_info);
-    vk_instance_initialized = true;

    size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();

@ -2874,7 +2869,7 @@ static void ggml_vk_instance_init() {
        // Make sure at least one device exists
        if (devices.empty()) {
            std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
-            return;
+            GGML_ABORT("fatal error");
        }

        // Default to using all dedicated GPUs
@ -8349,13 +8344,8 @@ ggml_backend_reg_t ggml_backend_vk_reg() {
        /* .iface       = */ ggml_backend_vk_reg_i,
        /* .context     = */ nullptr,
    };
-    try {
-        ggml_vk_instance_init();
-        return &reg;
-    } catch (const vk::SystemError& e) {
-        VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: System error: " << e.what());
-        return nullptr;
-    }
+
+    return &reg;
 }

 // Extension availability
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -211,6 +211,12 @@ class Keys:
        TYPE       = "adapter.type"
        LORA_ALPHA = "adapter.lora.alpha"

+    class IMatrix:
+        CHUNK_COUNT = "imatrix.chunk_count"
+        CHUNK_SIZE  = "imatrix.chunk_size"
+        DATASET     = "imatrix.dataset"
+
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@ -219,6 +225,7 @@ class Keys:
 class GGUFType:
    MODEL   = "model"
    ADAPTER = "adapter"
+    IMATRIX = "imatrix"


 class MODEL_ARCH(IntEnum):
--- a/requirements.txt
+++ b/requirements.txt
@ -8,5 +8,6 @@

 -r ./requirements/requirements-convert_hf_to_gguf.txt
 -r ./requirements/requirements-convert_hf_to_gguf_update.txt
+-r ./requirements/requirements-convert_legacy_imatrix_to_gguf.txt
 -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
 -r ./requirements/requirements-convert_lora_to_gguf.txt
--- a/requirements/requirements-convert_legacy_imatrix_to_gguf.txt
+++ b/requirements/requirements-convert_legacy_imatrix_to_gguf.txt
@ -0,0 +1 @@
+-r ./requirements-convert_legacy_llama.txt
Author	SHA1	Message	Date
Francis Couture-Harpin	1be357d990	Merge branch 'master' into compilade/imatrix-batched-chunks	2025-02-09 12:06:24 -05:00
Francis Couture-Harpin	db502ddd0e	Merge branch 'master' into compilade/imatrix-batched-chunks	2025-02-09 12:06:15 -05:00
Francis Couture-Harpin	c7a32e761d	common : use GGUF for imatrix output by default	2025-01-30 19:56:20 -05:00
Francis Couture-Harpin	2d79a7077c	quantize : use unused imatrix chunk_size with LLAMA_TRACE	2024-09-10 12:09:17 -04:00
Francis Couture-Harpin	8c13e16bb0	imatrix : allow loading mis-ordered tensors. Sums and counts tensors no longer need to be consecutive. * imatrix : more sanity checks when loading multiple imatrix files * imatrix : use ggml_format_name instead of std::string concatenation. Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2024-09-10 11:51:23 -04:00
Francis Couture-Harpin	2217247051	imatrix : remove unused n_entries	2024-09-09 22:35:47 -04:00
Francis Couture-Harpin	efa9186dc8	imatrix : avoid using designated initializers in C++	2024-09-09 22:33:10 -04:00
Francis Couture-Harpin	894ed8d7b6	py : include imatrix converter requirements in toplevel requirements	2024-09-09 22:20:18 -04:00
Francis Couture-Harpin	9e6b0e9419	perplexity : revert changes	2024-09-09 22:00:37 -04:00
Francis Couture-Harpin	503630e88a	py : add requirements for legacy imatrix convert script	2024-09-09 21:56:04 -04:00
Francis Couture-Harpin	d19101c9a0	imatrix : use FMA and sort tensor names	2024-09-08 11:03:59 -04:00
Francis Couture-Harpin	3ad0603c65	Merge branch 'master' into compilade/imatrix-batched-chunks	2024-09-08 10:05:08 -04:00
Francis Couture-Harpin	c8ab6a3ba3	imatrix : fix conversion problems	2024-09-08 10:04:01 -04:00
Francis Couture-Harpin	3de9300c37	imatrix : use GGUF to store imatrix data	2024-09-06 17:17:25 -04:00
Francis Couture-Harpin	347247a24e	imatrix : fix segfault when using a single chunk per batch	2024-08-20 15:35:56 -04:00
Francis Couture-Harpin	bce54642c8	imatrix : allow processing multiple chunks per batch * perplexity : simplify filling the batch	2024-08-20 15:17:24 -04:00