Compare commits

4 commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Olivier Chafik | d7b31a9d84 | sync: minja (a72057e519) (#11774) | 2025-02-10 09:34:09 +00:00 |
| pascal-lc | 9ac3457b39 | Update README.md [no ci] (#11781): typo `\` -> `/`, changing `\` to the UNIX path separator `/` | 2025-02-10 09:05:57 +01:00 |
| Danny Milosavljevic | c2a67efe38 | vulkan: Make Vulkan optional at runtime (#11493). (#11494) Co-authored-by: Jeff Bolz <jbolz@nvidia.com> | 2025-02-10 07:17:21 +01:00 |
| Wagner Bruna | b044a0fe3c | vulkan: add environment variable GGML_VK_PREFER_HOST_MEMORY to avoid VRAM allocation (#11592) | 2025-02-10 07:08:22 +01:00 |
12 changed files with 248 additions and 514 deletions

View file

@ -249,16 +249,30 @@ class chat_template {
inputs.add_generation_prompt = false;
full = apply(inputs);
}
if (full.find(prefix) != 0) {
if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
prefix = prefix.substr(0, prefix.size() - eos_token_.size());
auto eos_pos_last = full.rfind(eos_token_);
if (eos_pos_last == prefix.size() - eos_token_.size() ||
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
full = full.substr(0, eos_pos_last);
}
size_t common_prefix_length = 0;
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
if (prefix[i] != full[i]) {
break;
}
if (prefix[i] == '<') {
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
// but it removes thinking tags for past messages.
// The prefix and full strings diverge at <think> vs. <tool▁calls▁begin>, we avoid consuming the leading <.
continue;
}
common_prefix_length = i + 1;
}
if (full.find(prefix) != 0) {
auto example = full.substr(common_prefix_length);
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
} else {
tool_call_example_ = example;
}
tool_call_example_ = full.substr(prefix.size());
}
} catch (const std::exception & e) {
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@ -363,7 +377,7 @@ class chat_template {
if (polyfill_tools) {
adjusted_messages = add_system(inputs.messages,
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
} else {
adjusted_messages = inputs.messages;
}
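The common-prefix loop in the first hunk above is subtle: it walks the two renderings in lockstep but defers counting a matching `<`, so that when the strings diverge on the character right after an opening bracket (e.g. `<think>` vs. `<｜tool▁calls▁begin｜>`) the cut lands before the `<` rather than after it. A minimal standalone sketch of just that loop (function name and sample strings are mine, not from the patch):

```cpp
#include <cstdio>
#include <string>

// Longest common prefix of two strings, except that a matching '<' is only
// counted once the character *after* it also matches -- mirroring the guard above.
static size_t common_prefix_no_open_angle(const std::string & a, const std::string & b) {
    size_t n = 0;
    for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
        if (a[i] != b[i]) break;
        if (a[i] == '<') continue; // don't commit to the '<' yet
        n = i + 1;
    }
    return n;
}

int main() {
    std::string prefix = "Hi!<think>";             // render with add_generation_prompt
    std::string full   = "Hi!<|tool_call|>{...}";  // render with an example tool call
    size_t n = common_prefix_no_open_angle(prefix, full);
    printf("diverge after %zu chars: '%s'\n", n, full.substr(n).c_str());
    // prints: diverge after 3 chars: '<|tool_call|>{...}'
}
```

Deferring the `n = i + 1` update is what keeps the opening bracket inside the extracted tool-call example whenever the tag names differ from their first character.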

View file

@ -383,7 +383,7 @@ struct common_params {
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.gguf"; // save the resulting imatrix to this file
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations

View file

@ -1385,6 +1385,13 @@ static std::string strip(const std::string & s) {
return s.substr(start, end - start + 1);
}
static std::string capitalize(const std::string & s) {
if (s.empty()) return s;
auto result = s;
result[0] = std::toupper(result[0]);
return result;
}
static std::string html_escape(const std::string & s) {
std::string result;
result.reserve(s.size());
@ -1462,6 +1469,9 @@ public:
if (method->get_name() == "strip") {
vargs.expectArgs("strip method", {0, 0}, {0, 0});
return Value(strip(str));
} else if (method->get_name() == "capitalize") {
vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
return Value(capitalize(str));
} else if (method->get_name() == "endswith") {
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
auto suffix = vargs.args[0].get<std::string>();
@ -1792,7 +1802,7 @@ private:
auto left = parseStringConcat();
if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
static std::regex not_tok(R"(not\b)");
std::string op_str;
while (!(op_str = consumeToken(compare_tok)).empty()) {
@ -2171,7 +2181,7 @@ private:
using TemplateTokenIterator = TemplateTokenVector::const_iterator;
std::vector<std::string> parseVarNames() {
static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");
static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
std::vector<std::string> group;
if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@ -2194,13 +2204,13 @@ private:
}
TemplateTokenVector tokenize() {
static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
static std::regex expr_open_regex(R"(\{\{([-~])?)");
static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
static std::regex block_close_regex(R"(\s*([-~])?%\})");
TemplateTokenVector tokens;
std::vector<std::string> group;
@ -2284,7 +2294,7 @@ private:
auto post_space = parseBlockClose();
tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
} else if (keyword == "set") {
static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
std::string ns;
std::vector<std::string> var_names;
@ -2336,6 +2346,11 @@ private:
throw std::runtime_error("Unexpected block: " + keyword);
}
} else if (std::regex_search(it, end, match, non_text_open_regex)) {
if (!match.position()) {
if (match[0] != "{#")
throw std::runtime_error("Internal error: Expected a comment");
throw std::runtime_error("Missing end of comment tag");
}
auto text_end = it + match.position();
text = std::string(it, text_end);
it = text_end;
@ -2400,7 +2415,7 @@ private:
auto text = text_token->text;
if (post_space == SpaceHandling::Strip) {
static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
static std::regex trailing_space_regex(R"(\s+$)");
text = std::regex_replace(text, trailing_space_regex, "");
} else if (options.lstrip_blocks && it != end) {
auto i = text.size();
@ -2410,7 +2425,7 @@ private:
}
}
if (pre_space == SpaceHandling::Strip) {
static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
static std::regex leading_space_regex(R"(^\s+)");
text = std::regex_replace(text, leading_space_regex, "");
} else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
if (text.length() > 0 && text[0] == '\n') {
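Many of the regex edits in this file collapse `[\r\n\s]` or `(\s|\r|\n)` to plain `\s`. With the default ECMAScript grammar of `std::regex`, `\s` already matches `\r` and `\n`, so these rewrites simplify the patterns without changing what they match. A quick self-check of that equivalence (standard library only; the test string is mine):

```cpp
#include <cassert>
#include <regex>
#include <string>

int main() {
    const std::string text = " \t\r\n end";
    // old and new spellings of "strip leading whitespace"
    std::regex old_re(R"(^(\s|\r|\n)+)");
    std::regex new_re(R"(^\s+)");
    std::string a = std::regex_replace(text, old_re, "");
    std::string b = std::regex_replace(text, new_re, "");
    assert(a == b && a == "end"); // both patterns strip spaces, tabs, CR and LF
}
```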

View file

@ -1,122 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations

import os
import sys
import logging
import argparse

from typing import Any
from pathlib import Path
from dataclasses import dataclass

import numpy as np
import numpy.typing as npt

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))

import gguf

logger = logging.getLogger("imatrix-to-gguf")


class IMatrixWriter(gguf.GGUFWriter):
    def add_architecture(self) -> None:
        # no arch is stored in imatrix files
        pass


@dataclass
class IMatrixEntry:
    values: np.ndarray[Any, np.dtype[np.float32]]
    counts: np.ndarray[Any, np.dtype[np.float32]]


class IMatrixReader:
    chunk_size: int = 512  # guess
    offset: int = 0
    data: np.ndarray[Any, np.dtype[np.uint8]]
    n_entries: int
    entries: dict[str, IMatrixEntry]
    chunk_count: int
    dataset: str

    def _get(self, dtype: npt.DTypeLike, count: int = 1) -> npt.NDArray[Any]:
        count = int(count)
        itemsize = int(np.empty([], dtype=dtype).itemsize)
        offset = self.offset
        self.offset = offset + itemsize * count
        return self.data[offset:self.offset].view(dtype=dtype)[:count]

    def __init__(self, imatrix: Path):
        self.offset = 0
        self.entries = {}
        self.data = np.memmap(imatrix)
        n_entries = self._get(np.int32).item()
        assert n_entries >= 0
        for _ in range(n_entries):
            len = self._get(np.int32).item()
            name = self._get(np.uint8, len).tobytes().decode("utf-8")
            ncall = self._get(np.int32).item()
            nval = self._get(np.int32).item()
            data = self._get(np.float32, nval)
            assert name not in self.entries, f"duplicated name: {name!r}"
            self.entries[name] = IMatrixEntry(data * np.float32(self.chunk_size), np.array([ncall * self.chunk_size], dtype=np.float32))
        self.chunk_count = self._get(np.int32).item()
        dataset_len = self._get(np.int32).item()
        self.dataset = self._get(np.uint8, dataset_len).tobytes().decode("utf-8")

    def to_writer(self, outfile: Path) -> IMatrixWriter:
        writer = IMatrixWriter(path=outfile, arch="")

        writer.add_type(gguf.GGUFType.IMATRIX)
        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_COUNT, self.chunk_count, gguf.GGUFValueType.UINT32)
        writer.add_key_value(gguf.Keys.IMatrix.CHUNK_SIZE, self.chunk_size, gguf.GGUFValueType.UINT32)
        writer.add_key_value(gguf.Keys.IMatrix.DATASET, self.dataset, gguf.GGUFValueType.STRING)

        for name, entry in self.entries.items():
            writer.add_tensor(name + ".sums", entry.values)
            writer.add_tensor(name + ".counts", entry.counts)

        return writer


def parse_args():
    parser = argparse.ArgumentParser(
        description="Convert an old imatrix.dat file to a GGUF compatible file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input.",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "imatrix", type=Path,
        help="path to an imatrix file",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    if args.outfile is None:
        input_file: Path = args.imatrix
        if input_file.suffix != ".gguf":
            args.outfile = input_file.with_suffix(".gguf")
            if args.outfile.exists():
                logger.error(f"default file exists, specify with --outfile to overwrite: {args.outfile}")
                exit(1)

    writer = IMatrixReader(args.imatrix).to_writer(args.outfile)

    writer.write_header_to_file(args.outfile)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
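For reference, the legacy `imatrix.dat` layout this converter parses is: `int32 n_entries`; per entry an `int32` name length, the raw name bytes, `int32 ncall`, `int32 nval`, and `nval` `float32` values; then a trailing `int32` call count (`chunk_count` in the script above), an `int32` dataset-name length, and the dataset-name bytes. A bare-bones C++ reader sketch of the same layout (no validation; illustrative only, not the project's loader):

```cpp
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

struct LegacyEntry { std::string name; int32_t ncall = 0; std::vector<float> values; };

int main(int argc, char ** argv) {
    if (argc < 2) return 1;
    std::ifstream in(argv[1], std::ios::binary);
    int32_t n_entries = 0;
    in.read((char *) &n_entries, sizeof(n_entries));
    std::vector<LegacyEntry> entries(n_entries);
    for (auto & e : entries) {
        int32_t len = 0, nval = 0;
        in.read((char *) &len, sizeof(len));
        e.name.resize(len);
        in.read(e.name.data(), len);
        in.read((char *) &e.ncall, sizeof(e.ncall));
        in.read((char *) &nval, sizeof(nval));
        e.values.resize(nval);
        in.read((char *) e.values.data(), nval * sizeof(float));
    }
    // trailing metadata, present in newer .dat files only
    int32_t last_call = 0, dataset_len = 0;
    in.read((char *) &last_call, sizeof(last_call));
    in.read((char *) &dataset_len, sizeof(dataset_len));
    std::string dataset(dataset_len, '\0');
    in.read(dataset.data(), dataset_len);
}
```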

View file

@ -2,7 +2,6 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "gguf.h"
#include <cmath>
#include <cstdio>
@ -11,8 +10,8 @@
#include <thread>
#include <mutex>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <map>
#include <algorithm>
#if defined(_MSC_VER)
@ -22,27 +21,16 @@
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...]\n" , argv[0]);
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG("\n");
}
static bool str_remove_suffix(std::string & str, const std::string & suffix) {
bool has_suffix = str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), str.size(), suffix) == 0;
if (has_suffix) {
str = str.substr(0, str.size() - suffix.size());
}
return has_suffix;
}
static const char * const LLM_KV_IMATRIX_DATASET = "imatrix.dataset";
static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size";
struct Stats {
std::vector<float> values;
std::vector<int64_t> counts;
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
class IMatrixCollector {
@ -50,13 +38,13 @@ public:
IMatrixCollector() = default;
void set_params(common_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int32_t n_chunk = -1) const;
bool load_imatrix(const char * file_name);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * fname);
private:
std::unordered_map<std::string, Stats> m_stats;
common_params m_params;
std::mutex m_mutex;
int32_t m_last_chunk = 0;
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
};
@ -130,23 +118,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto & e = m_stats[wname];
if (e.counts.size() == 1 && n_as > 1) {
// broadcast, when loading an old imatrix
e.counts.resize(n_as, e.counts[0]);
}
++e.ncall;
if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
}
else if (e.counts.size() != (size_t)n_as) {
LOG_ERR("Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), (int)n_as);
exit(1); //GGML_ABORT("fatal error");
}
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
@ -163,26 +145,24 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const int64_t i12 = row;
const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
e.counts[ex]++;
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] = std::fma(x[j], x[j], e.values[e_start + j]);
if (!std::isfinite((float)e.values[e_start + j])) {
LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
LOG("\n");
LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
}
}
const int32_t n_chunk = e.counts[ex] / (m_params.n_ctx / m_params.n_parallel);
if (n_chunk > m_last_chunk) {
const int32_t chunk_step = n_chunk - m_last_chunk;
m_last_chunk = n_chunk;
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
save_imatrix(m_last_chunk);
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
@ -190,38 +170,32 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(1, 0);
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ABORT("fatal error");
}
else if (e.counts.size() != 1) {
LOG_ERR("Oops: inconsistent expert count for %s (%d vs %d)\n", wname.c_str(), (int)e.counts.size(), 1);
exit(1); //GGML_ABORT("fatal error");
}
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
// TODO: higher dimensions
++e.ncall;
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];
e.counts[0]++;
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] = std::fma(x[j], x[j], e.values[j]);
if (!std::isfinite((float)e.values[j])) {
LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
}
const int32_t n_chunk = e.counts[0] / (m_params.n_ctx / m_params.n_parallel);
if (n_chunk > m_last_chunk) {
const int32_t chunk_step = n_chunk - m_last_chunk;
m_last_chunk = n_chunk;
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
save_imatrix(m_last_chunk);
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
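On the chunk-based side of this diff, `e.counts` tracks rows seen and a chunk is `n_ctx / n_parallel` rows, so `n_chunk = counts / (n_ctx / n_parallel)`. The save condition `(m_last_chunk % n_out_freq) / chunk_step == 0` is an integer-division idiom: it is true exactly when `m_last_chunk % n_out_freq < chunk_step`, i.e. whenever a multiple of `n_out_freq` was crossed since the previous update, even if several chunks completed in one callback. A small demo of that trigger (the constants are mine):

```cpp
#include <cstdio>

int main() {
    const int n_out_freq = 10;
    int last_chunk = 0;
    // pretend each callback completes 3 chunks at once
    for (int n_chunk = 3; n_chunk <= 30; n_chunk += 3) {
        const int chunk_step = n_chunk - last_chunk;
        last_chunk = n_chunk;
        // fires whenever a multiple of n_out_freq was crossed in this step
        if ((last_chunk % n_out_freq) / chunk_step == 0) {
            printf("save at chunk %d\n", last_chunk);
        }
    }
    // prints: save at chunk 12, 21, 30 (just after crossing 10, 20, 30)
}
```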
@ -229,22 +203,22 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true;
}
void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
void IMatrixCollector::save_imatrix(int ncall) const {
auto fname = m_params.out_file;
if (fname.empty()) {
fname = "imatrix.gguf";
fname = "imatrix.dat";
}
if (n_chunk > 0) {
if (ncall > 0) {
fname += ".at_";
fname += std::to_string(n_chunk);
fname += std::to_string(ncall);
}
// avoid writing imatrix entries that do not have full data
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
int n_entries = 0;
std::vector<std::string> to_store;
size_t data_size = 0;
bool is_first = true; // for printing
for (const auto & kv : m_stats) {
@ -276,157 +250,101 @@ void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
continue;
}
n_entries++;
to_store.push_back(kv.first);
data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
}
if (to_store.size() < m_stats.size()) {
LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
// deterministic tensor name order
std::sort(to_store.begin(), to_store.end());
struct ggml_init_params params = {
/* .mem_size = */ data_size,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
struct gguf_context * ctx_gguf = gguf_init_empty();
gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
// Write the input filename to later on specify it in quantize
gguf_set_val_str(ctx_gguf, LLM_KV_IMATRIX_DATASET, m_params.prompt_file.c_str());
// Write the number of chunks the matrix was computed with
gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
std::ofstream out(fname, std::ios::binary);
out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & name : to_store) {
const auto & stat = m_stats.at(name);
const int32_t nval = (int32_t) stat.values.size();
const int32_t nmat = (int32_t) stat.counts.size();
int len = name.size();
out.write((const char *) &len, sizeof(len));
out.write(name.c_str(), len);
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
int nval = stat.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) {
struct ggml_tensor * sums = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
ggml_format_name(sums, "%s.sums", name.c_str());
ggml_format_name(counts, "%s.counts", name.c_str());
for (int32_t j = 0; j < nval; ++j) {
((float *) sums->data)[j] = (float) stat.values[j];
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
}
for (int32_t j = 0; j < nmat; ++j) {
((float *) counts->data)[j] = (float) stat.counts[j];
}
gguf_add_tensor(ctx_gguf, sums);
gguf_add_tensor(ctx_gguf, counts);
out.write((const char*)tmp.data(), nval*sizeof(float));
}
}
gguf_write_to_file(ctx_gguf, fname.c_str(), false);
// Write the number of calls the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the input filename at the end of the file to later on specify it in quantize
{
int len = m_params.prompt_file.size();
out.write((const char *) &len, sizeof(len));
out.write(m_params.prompt_file.c_str(), len);
}
LOGV(1, "\n");
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
bool IMatrixCollector::load_imatrix(const char * file_name) {
struct ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false, // the data is needed
/* .ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
if (!ctx_gguf) {
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
LOG_ERR("%s: failed to open %s\n",__func__, fname);
return false;
}
const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
if (n_entries < 1) {
LOG_ERR("%s: no data in file %s\n", __func__, file_name);
gguf_free(ctx_gguf);
ggml_free(ctx);
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
LOG_ERR("%s: no data in file %s\n", __func__, fname);
return false;
}
const std::string sums_suffix{".sums"};
const std::string counts_suffix{".counts"};
// Could re-use m_stats instead, but this allows
// checking for completeness of *each* loaded imatrix file
// and also makes it easier to re-use a similar implementation in quantize.cpp
// Using an ordered map to get a deterministic iteration order.
std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string name = cur->name;
if (name.empty()) { continue; }
if (str_remove_suffix(name, sums_suffix)) {
// sums
sums_counts_for[name].first = cur;
} else if (str_remove_suffix(name, counts_suffix)) {
// counts
sums_counts_for[name].second = cur;
} else {
LOG_ERR("%s: invalid imatrix tensor name: %s\n", __func__, name.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len));
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
}
for (const auto & sc : sums_counts_for) {
const std::string & name = sc.first;
const struct ggml_tensor * sums = sc.second.first;
const struct ggml_tensor * counts = sc.second.second;
if (!sums || !counts) {
LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto & e = m_stats[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
m_stats = {};
return false;
}
auto & e = m_stats[name];
int64_t nval = ggml_nelements(sums);
if (e.values.empty()) {
e.values.resize(nval, 0);
} else if ((size_t) nval != e.values.size()) {
LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
gguf_free(ctx_gguf);
ggml_free(ctx);
e.counts.resize(nval, 0);
}
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
m_stats = {};
return false;
}
int64_t ncounts = ggml_nelements(counts);
if (e.counts.empty()) {
e.counts.resize(ncounts, 0);
} else if (e.counts.size() == 1 && ncounts > 1) {
// broadcast, when loading an old imatrix
e.counts.resize(ncounts, e.counts[0]);
} else if ((size_t) ncounts != e.counts.size()) {
LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
gguf_free(ctx_gguf);
ggml_free(ctx);
return false;
// Recreate the state as expected by save_imatrix(), and correct for the weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
}
e.ncall += ncall;
// Recreate the state as expected by save_imatrix()
for (int64_t j = 0; j < nval; j++) {
e.values[j] += ((const float *) sums->data)[j];
}
for (int64_t j = 0; j < ncounts; j++) {
e.counts[j] += std::lround(((const float *) counts->data)[j]);
}
}
gguf_free(ctx_gguf);
ggml_free(ctx);
return true;
}
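Note how the two legacy paths fit together: the save path stores `values[i] / counts[i] * ncall` per element (a mean scaled by the call count), and the load path adds the stored values into `e.values` and `ncall` into every element of `e.counts`, so `values[i] / counts[i]` again yields a call-weighted mean after merging several files. A worked two-file merge under those assumptions (the numbers are invented for illustration):

```cpp
#include <cstdio>
#include <vector>

int main() {
    // two legacy imatrix files for the same tensor: stored value = mean * ncall
    const float file_a[] = {2.0f, 4.0f}; const int ncall_a = 2; // means {1, 2}
    const float file_b[] = {9.0f, 3.0f}; const int ncall_b = 3; // means {3, 1}
    std::vector<float> values(2, 0.0f);
    std::vector<int>   counts(2, 0);
    for (int i = 0; i < 2; i++) { values[i] += file_a[i]; counts[i] += ncall_a; }
    for (int i = 0; i < 2; i++) { values[i] += file_b[i]; counts[i] += ncall_b; }
    for (int i = 0; i < 2; i++) {
        // call-weighted means: (1*2 + 3*3)/5 = 2.2 and (2*2 + 1*3)/5 = 1.4
        printf("mean[%d] = %g\n", i, values[i] / counts[i]);
    }
}
```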
@ -509,11 +427,12 @@ static void process_logits(
}
}
static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
const int n_ctx = llama_n_ctx(ctx);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
@ -558,61 +477,45 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c
double nll = 0.0;
double nll2 = 0.0;
LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int n_seq = std::max(1, n_batch / n_ctx);
GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
std::vector<float> logits;
if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
for (int i = 0; i < n_chunk; i += n_seq) {
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int n_seq_batch = std::min(n_seq, n_chunk - i);
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// clear the batch
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_vocab_bos(vocab);
}
common_batch_clear(batch);
for (int seq = 0; seq < n_seq_batch; seq++) {
int seq_start = batch_start + seq*n_ctx;
// save original token and restore it after eval
const auto token_org = tokens[seq_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[seq_start] = llama_vocab_bos(vocab);
}
for (int k = 0; k < batch_size; ++k) {
// NOTE: specifying all logits to get activations for the output.weight tensor
// and also for the perplexity calculation.
// TODO: only get outputs when (params.process_output || params.compute_ppl)
// (not possible when this skips FFN computation of the last layer)
common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
}
// restore the original token in case it was set to BOS
tokens[seq_start] = token_org;
for (int i = 0; i < batch_size; i++) {
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
}
if (llama_decode(ctx, batch)) {
@ -621,19 +524,23 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c
return false;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
llama_batch_free(batch);
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk / n_seq);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
@ -643,27 +550,17 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c
if (params.compute_ppl) {
const int first = n_ctx/2;
for (int seq = 0; seq < n_seq_batch; seq++) {
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);
const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
process_logits(n_vocab, all_logits + first*n_vocab,
tokens_data, n_ctx - 1 - first,
workers, nll, nll2,
logit_history.data() + start + seq*n_ctx + first,
prob_history.data() + start + seq*n_ctx + first);
count += n_ctx - first - 1;
LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
}
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();
}
}
LOG("\n");
if (params.compute_ppl) {
@ -679,8 +576,6 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c
}
}
llama_batch_free(batch);
return true;
}
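The multi-sequence variant on one side of this diff evaluates several context windows per `llama_decode` call: with `n_batch` either smaller than or a multiple of `n_ctx` (hence the asserts), it derives `n_seq = max(1, n_batch / n_ctx)` parallel sequences, sizes the KV cache as `n_seq * n_ctx`, and advances the chunk loop by `n_seq`. The arithmetic in isolation (sample sizes are mine):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    int n_ctx   = 512;   // tokens per chunk
    int n_batch = 2048;  // decode batch size
    const int n_seq = std::max(1, n_batch / n_ctx); // 4 chunks evaluated per pass
    const int n_kv  = n_seq * n_ctx;                // KV cache sized for all of them
    n_batch = std::min(n_batch, n_kv);
    printf("n_seq=%d n_kv=%d n_batch=%d\n", n_seq, n_kv, n_batch);
    // prints: n_seq=4 n_kv=2048 n_batch=2048
}
```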
@ -697,22 +592,7 @@ int main(int argc, char ** argv) {
common_init();
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
return 1;
}
{
const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
const int32_t n_kv = n_seq * n_ctx;
params.n_parallel = n_seq;
params.n_ctx = n_kv;
params.n_batch = std::min(params.n_batch, n_kv);
}
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
@ -768,7 +648,7 @@ int main(int argc, char ** argv) {
}
LOG_INF("No prompt provided; combining precomputed matrices only.\n");
} else {
if (!compute_imatrix(ctx, params, n_ctx)) {
if (!compute_imatrix(ctx, params)) {
return 1;
}
}

View file

@ -37,7 +37,7 @@ Once downloaded, place your model in the models folder in llama.cpp.
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```bash
./llama-cli -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
```
### Windows:

View file

@ -1,13 +1,13 @@
#include "common.h"
#include "llama.h"
#include "gguf.h"
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <unordered_map>
#include <map>
#include <fstream>
#include <cmath>
struct quant_option {
std::string name;
@ -60,11 +60,6 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
// TODO: share with imatrix.cpp
static const char * const LLM_KV_IMATRIX_DATASET = "imatrix.dataset";
static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size";
static bool striequals(const char * a, const char * b) {
while (*a && *b) {
if (std::tolower(*a) != std::tolower(*b)) {
@ -134,114 +129,67 @@ static void usage(const char * executable) {
exit(1);
}
// TODO: share with implementation in imatrix.cpp
static bool str_remove_suffix(std::string & str, const std::string & suffix) {
bool has_suffix = str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), str.size(), suffix) == 0;
if (has_suffix) {
str = str.substr(0, str.size() - suffix.size());
}
return has_suffix;
}
static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
struct ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false, // the data is needed
/* .ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "%s: if this is an older imatrix file, make sure to convert it to the GGUF-based imatrix format\n", __func__);
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
exit(1);
}
const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
if (n_entries < 1) {
fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
int n_entries;
in.read((char *)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
exit(1);
}
const int dataset_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASET);
const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
const int chunk_size_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) {
fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
exit(1);
}
const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);
const std::string sums_suffix{".sums"};
const std::string counts_suffix{".counts"};
// Using an ordered map to get a deterministic iteration order.
std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string name = cur->name;
if (name.empty()) { continue; }
if (str_remove_suffix(name, sums_suffix)) {
// sums
sums_counts_for[name].first = cur;
} else if (str_remove_suffix(name, counts_suffix)) {
// counts
sums_counts_for[name].second = cur;
} else {
fprintf(stderr, "%s: invalid imatrix tensor name: %s\n", __func__, name.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len));
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
exit(1);
}
}
for (const auto & sc : sums_counts_for) {
const std::string & name = sc.first;
const struct ggml_tensor * sums = sc.second.first;
const struct ggml_tensor * counts = sc.second.second;
if (!sums || !counts) {
fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
gguf_free(ctx_gguf);
ggml_free(ctx);
exit(1);
}
const int64_t ne0 = sums->ne[0];
const int64_t ne1 = sums->ne[1];
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto & e = imatrix_data[name];
e.resize(ggml_nelements(sums));
float max_count = 0.0f;
for (int64_t j = 0; j < ne1; ++j) {
const float count = ((const float *) counts->data)[j];
for (int64_t i = 0; i < ne0; ++i) {
e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
}
if (count > max_count) {
max_count = count;
}
int ncall;
in.read((char *)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n", __func__, i);
imatrix_data = {};
exit(1);
}
e.resize(nval);
in.read((char *)e.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n", __func__, i);
imatrix_data = {};
exit(1);
}
if (ncall > 0) {
for (auto& v : e) v /= ncall;
}
if (getenv("LLAMA_TRACE")) {
printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
}
}
int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx);
imatrix_dataset = gguf_get_val_str(ctx_gguf, dataset_idx);
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);
gguf_free(ctx_gguf);
ggml_free(ctx);
return m_last_chunk;
// latest imatrix version contains the dataset filename at the end of the file
int m_last_call = 0;
if (in.peek() != EOF) {
in.read((char *)&m_last_call, sizeof(m_last_call));
int dataset_len;
in.read((char *)&dataset_len, sizeof(dataset_len));
std::vector<char> dataset_as_vec(dataset_len);
in.read(dataset_as_vec.data(), dataset_len);
imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
}
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
return m_last_call;
}
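The loader above distinguishes the two `.dat` generations by probing `in.peek() != EOF` once the entries are consumed: old files end right there, newer ones append the call count plus the dataset filename. A compact sketch of reading such an optional trailer (`read_trailer` and its field names are hypothetical, not the project's API):

```cpp
#include <cstdint>
#include <cstdio>
#include <istream>
#include <sstream>
#include <string>

// Read an optional trailer appended by newer writers; older files simply end.
static bool read_trailer(std::istream & in, int32_t & last_call, std::string & dataset) {
    if (in.peek() == EOF) {
        return false; // old format: nothing after the entries
    }
    int32_t len = 0;
    in.read((char *) &last_call, sizeof(last_call));
    in.read((char *) &len, sizeof(len));
    dataset.resize(len);
    in.read(dataset.data(), len);
    return !in.fail();
}

int main() {
    // simulate a newer file: int32 last_call, int32 len, then the dataset name
    int32_t last_call = 42, len = 8;
    std::string blob((char *) &last_call, sizeof(last_call));
    blob.append((char *) &len, sizeof(len));
    blob += "wiki.txt";
    std::istringstream in(blob);
    int32_t lc = 0; std::string ds;
    if (read_trailer(in, lc, ds)) printf("last_call=%d dataset=%s\n", lc, ds.c_str());
}
```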
static int prepare_imatrix(const std::string & imatrix_file,

View file

@ -10,8 +10,6 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_BACKEND_API void ggml_vk_instance_init(void);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

View file

@ -167,6 +167,7 @@ struct vk_device_struct {
uint32_t subgroup_size;
uint32_t shader_core_count;
bool uma;
bool prefer_host_memory;
bool float_controls_rte_fp16;
bool subgroup_size_control;
@ -1294,7 +1295,9 @@ static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk:
static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
vk_buffer buf;
try {
if (device->uma) {
if (device->prefer_host_memory) {
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
} else if (device->uma) {
// Fall back to host memory type
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
} else {
@ -2199,6 +2202,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->physical_device = physical_devices[dev_num];
const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
bool fp16_storage = false;
bool fp16_compute = false;
bool maintenance4_support = false;
@ -2787,14 +2793,12 @@ static void ggml_vk_print_gpu_info(size_t idx) {
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
void ggml_vk_instance_init() {
static void ggml_vk_instance_init() {
if (vk_instance_initialized) {
return;
}
VK_LOG_DEBUG("ggml_vk_instance_init()");
vk_instance_initialized = true;
uint32_t api_version = vk::enumerateInstanceVersion();
if (api_version < VK_API_VERSION_1_2) {
@ -2845,6 +2849,7 @@ void ggml_vk_instance_init() {
GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
}
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
@ -2869,7 +2874,7 @@ void ggml_vk_instance_init() {
// Make sure at least one device exists
if (devices.empty()) {
std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
GGML_ABORT("fatal error");
return;
}
// Default to using all dedicated GPUs
@ -8344,8 +8349,13 @@ ggml_backend_reg_t ggml_backend_vk_reg() {
/* .iface = */ ggml_backend_vk_reg_i,
/* .context = */ nullptr,
};
return &reg;
try {
ggml_vk_instance_init();
return &reg;
} catch (const vk::SystemError& e) {
VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: System error: " << e.what());
return nullptr;
}
}
// Extension availability
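`GGML_VK_PREFER_HOST_MEMORY` is a presence-based switch: setting it to any value (e.g. `GGML_VK_PREFER_HOST_MEMORY=1 ./llama-cli ...`) makes `ggml_vk_create_buffer_device` request host-visible, host-coherent memory first and fall back to device-local, trading VRAM pressure for PCIe traffic. A reduced sketch of the three-way selection above (`pick_memory_type` is my stand-in for the real Vulkan allocation call):

```cpp
#include <cstdio>
#include <cstdlib>

struct MemChoice { const char * first; const char * fallback; };

// Mirror of the three-way branch in ggml_vk_create_buffer_device above.
static MemChoice pick_memory_type(bool prefer_host, bool uma) {
    if (prefer_host) return { "host-visible|host-coherent", "device-local" };
    if (uma)         return { "device-local", "host-visible|host-coherent" };
    return { "device-local", "(none)" };
}

int main() {
    // presence-based toggle: any value enables it
    const bool prefer_host = std::getenv("GGML_VK_PREFER_HOST_MEMORY") != nullptr;
    const MemChoice c = pick_memory_type(prefer_host, /*uma=*/false);
    std::printf("try %s first, fall back to %s\n", c.first, c.fallback);
}
```

The same file also makes registration fail soft: `ggml_backend_vk_reg()` now runs the instance init inside a try/catch and returns `nullptr` on `vk::SystemError` instead of aborting, which is what makes the Vulkan backend optional at runtime.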

View file

@ -211,12 +211,6 @@ class Keys:
TYPE = "adapter.type"
LORA_ALPHA = "adapter.lora.alpha"
class IMatrix:
CHUNK_COUNT = "imatrix.chunk_count"
CHUNK_SIZE = "imatrix.chunk_size"
DATASET = "imatrix.dataset"
#
# recommended mapping of model tensor names for storage in gguf
#
@ -225,7 +219,6 @@ class Keys:
class GGUFType:
MODEL = "model"
ADAPTER = "adapter"
IMATRIX = "imatrix"
class MODEL_ARCH(IntEnum):

View file

@ -8,6 +8,5 @@
-r ./requirements/requirements-convert_hf_to_gguf.txt
-r ./requirements/requirements-convert_hf_to_gguf_update.txt
-r ./requirements/requirements-convert_legacy_imatrix_to_gguf.txt
-r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
-r ./requirements/requirements-convert_lora_to_gguf.txt

View file

@ -1 +0,0 @@
-r ./requirements-convert_legacy_llama.txt