Q4_0 scale selection using RMSE

Stephan Walter 2023-04-07 13:49:51 +02:00
parent 62cfc54f77
commit 40ebf819b0
9 changed files with 304 additions and 96 deletions
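For orientation: in ggml's Q4_0 format, a block of QK = 32 floats is stored as one shared scale d plus 32 packed 4-bit values, and until now d was always chosen as amax/7. This commit keeps that layout but, when creating model files, tries several candidate divisors per block and keeps the one with the lowest round-trip error. A minimal sketch of the layout and the old scheme, for reference (field names follow ggml.c; QK = 32 and the 20-byte block size are taken from the test change at the bottom of this diff):

#include <stdint.h>

#define QK 32

typedef struct {
    float   d;            // shared scale for the whole block
    uint8_t qs[QK / 2];   // 32 weights, 4 bits each, two per byte
} block_q4_0;             // 4 + 16 = 20 bytes per 32 weights

// Old scheme: d = amax / 7, each weight stored as round(x/d) + 8 in one nibble.
// New scheme: try a handful of divisors (quantize_row_q4_0_rmse in ggml.c below)
// and quantize the block with whichever gives the smallest squared error.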

Makefile

@@ -133,7 +133,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main quantize perplexity embedding
default: main quantize quantize-stats perplexity embedding
#
# Build library

SHA256SUMS

@@ -1,7 +1,11 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976 models/7B/ggml-model-f16.bin
5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f models/13B/ggml-model-f16.bin
4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8 models/13B/ggml-model-q4_0.bin
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth

examples/quantize-stats/quantize-stats.cpp

@@ -17,12 +17,15 @@
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
static const char * impl_strs[] = { "simd", "reference", "rmse" };
static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;
bool reference = false;
std::vector<ggml_quantize_impl_t> include_impl;
std::vector<std::string> include_layers;
std::vector<std::string> exclude_layers;
std::vector<enum ggml_type> include_types;
@@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -r, --reference\n");
fprintf(stderr, " use reference implementation (default: false)\n");
fprintf(stderr, " -i, --implementation\n");
fprintf(stderr, " select implementation (simd, reference, rmse)\n");
fprintf(stderr, " -v, --verbose\n");
fprintf(stderr, " verbose output (default: false)\n");
fprintf(stderr, " -p, --per-layer-stats\n");
@@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY;
}
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95);
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
if (print_histogram) {
printf("Error distribution:\n");
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
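For reference, the statistic printed here is the usual root-mean-square error over all sampled elements: rmse = sqrt(total_error / num_samples), i.e. sqrt((1/N) * sum_i (x_i - xhat_i)^2), where xhat_i is x_i after a quantize/dequantize round trip; total_error presumably accumulates the squared per-element differences in update_error_stats, which is unchanged and not shown in this diff.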
@@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
const quantize_fns_t & qfns,
bool use_reference,
ggml_quantize_impl_t impl,
const ggml_tensor * layer,
float * input_scratch,
char *quantized_scratch,
@@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
input_scratch = ggml_get_data_f32(layer) + offset;
}
if (use_reference) {
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
}
qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
@@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
}
}
if (print_layer_stats) {
print_error_stats(name, layer_error, false);
print_error_stats(name, impl, layer_error, false);
}
}
@@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
if (arg == "-h" || arg == "--help") {
quantize_stats_print_usage(argc, argv);
exit(0);
} else if (arg == "-r" || arg == "--reference") {
params.reference = true;
} else if (arg == "-i" || arg == "--implementation") {
if (++i >= argc) {
invalid_param = true;
break;
}
int j;
for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
// find match
}
if (j < GGML_QUANTIZE_IMPL_COUNT) {
params.include_impl.push_back((ggml_quantize_impl_t)j);
} else {
fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
invalid_param = true;
}
} else if (arg == "-v") {
params.verbose = true;
} else if (arg == "-p" || arg == "--per-layer-stats") {
@@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
std::vector<float> output_scratch(SCRATCH_ELEMENTS);
// loop throught quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
// loop through quantization types
for (int type = 0; type < GGML_TYPE_COUNT; type++) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
continue;
}
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
if (params.verbose) {
printf("testing %s ...\n", type_strs[i]);
}
error_stats global_stats {};
for (const auto& kv_tensor : tensors_sorted) {
if (!layer_included(params, kv_tensor.first)) {
for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
continue;
}
if (params.verbose) {
printf(" %s ...\n", kv_tensor.first.c_str());
}
std::string layer_name { type_strs[i] };
layer_name += "::" + kv_tensor.first;
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
qfns,
params.reference,
kv_tensor.second,
input_scratch.data(),
quantized_scratch.data(),
output_scratch.data(),
global_stats
);
}
print_error_stats(type_strs[i], global_stats, params.print_histogram);
if (params.verbose) {
printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
}
error_stats global_stats {};
for (const auto& kv_tensor : tensors_sorted) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
if (params.verbose) {
printf(" %s ...\n", kv_tensor.first.c_str());
}
std::string layer_name { type_strs[type] };
layer_name += "::" + kv_tensor.first;
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
qfns,
(ggml_quantize_impl_t)impl,
kv_tensor.second,
input_scratch.data(),
quantized_scratch.data(),
output_scratch.data(),
global_stats
);
}
print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
}
}
}
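With the new option, each selected quantization type is now tested once per requested implementation, and when no -i is given all of simd, reference and rmse are run. A typical invocation using only the flags defined above would be something like

    quantize-stats -m models/7B/ggml-model-f16.bin -i simd -i rmse -p

which prints one rmse/maxerr/95pct/median line per layer and per implementation, followed by the per-type summary.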

examples/quantize/scale.py

@@ -0,0 +1,76 @@
import matplotlib.pyplot as plt
# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
data = (
(-10.0, 0),
(-9.9, 1),
(-9.8, 3),
(-9.7, 65),
(-9.6, 738),
(-9.5, 5779),
(-9.4, 30880),
(-9.3, 121078),
(-9.2, 375674),
(-9.1, 941350),
(-9.0, 1990278),
(-8.9, 3635317),
(-8.8, 5891752),
(-8.7, 8678748),
(-8.6, 11771759),
(-8.5, 14873993),
(-8.4, 17594260),
(-8.3, 19553100),
(-8.2, 20415428),
(-8.1, 20017134),
(-8.0, 18357204),
(-7.9, 15597612),
(-7.8, 11993688),
(-7.7, 7842970),
(-7.6, 2880878),
(-7.5, 3478),
(-7.4, 2648437),
(-7.3, 5641970),
(-7.2, 5935890),
(-7.1, 4910790),
(-7.0, 3425891),
(-6.9, 2068250),
(-6.8, 1089883),
(-6.7, 502462),
(-6.6, 156356),
(-6.5, 205),
(-6.4, 163500),
(-6.3, 386291),
(-6.2, 423018),
(-6.1, 319360),
(-6.0, 180783),
(-5.9, 78822),
(-5.8, 28254),
(-5.7, 8698),
(-5.6, 1969),
(-5.5, 0),
(-5.4, 2069),
(-5.3, 5722),
(-5.2, 7107),
(-5.1, 5113),
(-5.0, 2332),
(-4.9, 636),
(-4.8, 130),
(-4.7, 12),
(-4.6, 1),
(-4.5, 0),
(-4.4, 3),
(-4.3, 4),
(-4.2, 8),
(-4.1, 8),
(-4.0, 27),
)
x, y = zip(*data)
fig, ax = plt.subplots()
b = ax.bar(x, y, 0.1, bottom=1)
ax.set_yscale("log")
ax.set_xlabel("scale")
ax.set_ylabel("N")
plt.title("Quantization scale factor with lowest RMS error")
plt.show()
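Per the comment at the top of the script, these counts were obtained by examining every q4_0 block of the 7B model. A hypothetical sketch of that kind of instrumentation (not part of this commit; QK, the clamp to [-8, 7] and the max/scale convention follow the ggml.c changes below), returning for one block the index of the 0.1-spaced candidate scale with the lowest squared round-trip error, so that a histogram like the one plotted here can be accumulated:

#include <math.h>

#define QK 32
#define N_SCALES 61   // -10.0, -9.9, ..., -4.0

static int best_scale_index(const float * x) {
    // signed value with the largest magnitude, as in quantize_block_q4_0_reference
    float amax = 0.0f, max = 0.0f;
    for (int l = 0; l < QK; l++) {
        if (amax < fabsf(x[l])) { amax = fabsf(x[l]); max = x[l]; }
    }
    int   best_si    = 0;
    float best_sqerr = INFINITY;
    for (int si = 0; si < N_SCALES; si++) {
        const float scale = -10.0f + 0.1f*si;
        const float d     = max / scale;
        const float id    = d ? 1.0f/d : 0.0f;
        float sqerr = 0.0f;
        for (int l = 0; l < QK; l++) {
            float q = roundf(x[l]*id);
            q = q < -8.0f ? -8.0f : (q > 7.0f ? 7.0f : q);
            const float err = q*d - x[l];   // round-trip error in the original scale
            sqerr += err*err;
        }
        if (sqerr < best_sqerr) { best_sqerr = sqerr; best_si = si; }
    }
    return best_si;   // caller increments histogram[best_si] over all blocks
}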

ggml.c

@@ -73,11 +73,15 @@ static int sched_yield (void) {
Sleep (0);
return 0;
}
#define __attribute__(...)
#else
#include <pthread.h>
#include <stdatomic.h>
typedef void* thread_ret_t;
#define __declspec(...)
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
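The __attribute__ and __declspec stubs added above let a single declaration carry both compilers' alignment attributes: under MSVC the trailing __attribute__((aligned(32))) expands to nothing, and under GCC/Clang the leading __declspec(align(32)) does. That is what makes the aligned stack buffer used later in quantize_row_q4_0_rmse compile on both:

    __declspec(align(32)) float out[8] __attribute__((aligned(32)));
    // MSVC sees:      __declspec(align(32)) float out[8];
    // GCC/Clang sees: float out[8] __attribute__((aligned(32)));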
@@ -517,39 +521,128 @@ typedef struct {
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");
// reference implementation for deterministic creation of model files
static inline void quantize_block_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, float scale) {
uint8_t pp[QK/2];
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int l = 0; l < QK; l++) {
const float v = x[l];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / scale;
const float id = d ? 1.0f/d : 0.0f;
y->d = d;
for (int l = 0; l < QK; l += 2) {
const float v0 = x[l + 0]*id;
const float v1 = x[l + 1]*id;
int8_t vs0 = roundf(v0);
int8_t vs1 = roundf(v1);
vs0 = MIN(MAX(0 - 8, vs0), 15 - 8);
vs1 = MIN(MAX(0 - 8, vs1), 15 - 8);
const uint8_t vi0 = vs0 + 8; // guaranteed to fit into 4 bits
const uint8_t vi1 = vs1 + 8; // thanks to the clamping of the signed values above
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(y->qs, pp, sizeof(pp));
}
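Taking the signed extreme value divided by a scale of roughly -8, instead of the old amax/7, lets the largest-magnitude weight land on the -8 end of the asymmetric [-8, 7] code range. For example, with max = 1.0 and scale = -8.5, d = max/scale ≈ -0.1176; the extreme value maps to round(1.0/d) = round(-8.5) = -9 (roundf rounds halfway cases away from zero), which the clamp brings back to -8, so the full negative end of the range is used. The old fixed scale of +7 would put the same value at +7. Which candidate wins for a given block depends on the other 31 values, and that is exactly what quantize_row_q4_0_rmse below measures.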
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;
for (int i = 0; i < nb; i++) {
quantize_block_q4_0_reference(x + i*QK, y + i, 7);
}
}
uint8_t pp[QK/2];
static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restrict y, int k) {
// For each q4_0 block, we try the following values to scale the shared float value
// and pick the one with lowest RMS error. We could do a more involved search,
// but this is a trade-off with speed of model generation and simplicity of the code.
// Operating on 8 values can reasonably be loop-unrolled or vectorized, but that is not
// manually done here.
// Values hand-picked according to histogram in examples/quantize/scale.py
// Include the value +7 of the old method to ensure we don't regress on RMSE on any block.
#define Q4_0_SCALE_CANDIDATE_COUNT 8
static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };
assert(k % QK == 0);
const int nb = k / QK;
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int l = 0; l < QK; l++) {
const float v = x[i*QK + l];
amax = MAX(amax, fabsf(v));
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;
// find scale with lowest sum of squared errors, equivalent to lowest RMS error
float best_sqerr = +INFINITY;
float best_scale = NAN;
y[i].d = d;
for (int si = 0; si < Q4_0_SCALE_CANDIDATE_COUNT; si++) {
const float scale = candidates[si];
const float d = max / scale;
const float id = d ? 1.0f / d : 0.0f;
float sqe_acc = 0.f;
#ifdef __AVX2__
const __m256 clamp_lo = _mm256_set1_ps( 0 - 8);
const __m256 clamp_hi = _mm256_set1_ps(15 - 8);
const __m256 id256 = _mm256_set1_ps(id);
for (int l = 0; l < QK; l += 8) {
// TODO: use _mm256_load_ps once the quantize loader uses mmap
__m256 v = _mm256_loadu_ps(&x[i * QK + l]);
v = _mm256_mul_ps(v, id256);
__m256 vs = _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
vs = _mm256_min_ps(_mm256_max_ps(clamp_lo, vs), clamp_hi);
const __m256 err = _mm256_sub_ps(vs, v);
const __m256 sqe = _mm256_mul_ps(err, err);
for (int l = 0; l < QK; l += 2) {
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;
// this is far from optimal speed-wise, but ensures identical results to scalar implementation
// we have to add the floats in sqe to sqe_acc separately and in the correct order
// 8x _mm_add_ps(,_mm_permute_ps()) would work but isn't faster than this:
__declspec(align(32)) float out[8] __attribute__((aligned(32)));
_mm256_store_ps(out, sqe);
for (int ei= 0; ei < 8; ei++) {
sqe_acc += out[ei];
}
}
#else
for (int l = 0; l < QK; l++) {
const float v = x[i * QK + l] * id;
int8_t vs = roundf(v);
vs = MIN(MAX(0 - 8, vs), 15 - 8);
sqe_acc += (vs - v) * (vs - v);
}
#endif
// the square error sum is calculated on un-scaled q's inside the inner loop
sqe_acc *= d * d;
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
assert(vi0 < 16);
assert(vi1 < 16);
pp[l/2] = vi0 | (vi1 << 4);
if (best_sqerr > sqe_acc) {
best_sqerr = sqe_acc;
best_scale = scale;
}
}
memcpy(y[i].qs, pp, sizeof(pp));
assert(isfinite(best_sqerr));
assert(isfinite(best_scale));
quantize_block_q4_0_reference(x + i * QK, y + i, best_scale);
}
}
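Note how the error is accounted for: inside each candidate's loop the squared error is taken between the clamped integer q_l = clamp(round(x_l/d), -8, 7) and the scaled input x_l/d, and the sum is multiplied by d*d only once per candidate, as the comment above says. That is the same quantity as the error in the original scale, because

    sum_l (d*q_l - x_l)^2 = d^2 * sum_l (q_l - x_l/d)^2,

so candidates with different d remain directly comparable; dividing by QK to get a true RMSE is unnecessary since it is the same constant for every candidate. The winning scale is then handed to quantize_block_q4_0_reference, so the bytes written to the block come from exactly the same code path as the deterministic reference implementation.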
@@ -6564,17 +6657,28 @@ static void ggml_compute_forward_mul_mat_f16_f32(
//}
}
static void quantize_row_q_missing(const float * x, void * y, int k) {
(void)x; (void)y; (void)k;
assert(false);
}
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = {
.dequantize_row_q = dequantize_row_q4_0,
.quantize_row_q = quantize_row_q4_0,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
.quantize_row_q = {
[GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_0,
[GGML_QUANTIZE_IMPL_REFERENCE] = (quantize_row_q_t)quantize_row_q4_0_reference,
[GGML_QUANTIZE_IMPL_RMSE] = (quantize_row_q_t)quantize_row_q4_0_rmse,
},
.vec_dot_q = ggml_vec_dot_q4_0,
},
[GGML_TYPE_Q4_1] = {
.dequantize_row_q = dequantize_row_q4_1,
.quantize_row_q = quantize_row_q4_1,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
.quantize_row_q = {
[GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_1,
[GGML_QUANTIZE_IMPL_REFERENCE] = quantize_row_q4_1_reference,
[GGML_QUANTIZE_IMPL_RMSE] = quantize_row_q_missing,
},
.vec_dot_q = ggml_vec_dot_q4_1,
},
};
@@ -6632,7 +6736,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
GGML_ASSERT(ne3 == ne13);
const enum ggml_type type = src0->type;
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q[GGML_QUANTIZE_IMPL_SIMD];
vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q;
// we don't support permuted src0 or src1
@@ -10602,7 +10706,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
for (int j = 0; j < n; j += k) {
block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK;
quantize_row_q4_0_reference(src + j, y, k);
quantize_row_q4_0_rmse(src + j, y, k);
for (int i = 0; i < nb; i++) {
for (int l = 0; l < QK; l += 2) {

ggml.h

@@ -788,20 +788,20 @@ int ggml_cpu_has_vsx(void);
// Internal types and functions exposed for tests and benchmarks
//
#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#endif
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef void (*dequantize_row_q_t)(const void * x, float * y, int k);
typedef void (*quantize_row_q_t)(const float * x, void * y, int k);
typedef void (*vec_dot_q_t)(const int n, float * s, const void * x, const void * y);
typedef enum {
GGML_QUANTIZE_IMPL_SIMD,
GGML_QUANTIZE_IMPL_REFERENCE,
GGML_QUANTIZE_IMPL_RMSE,
GGML_QUANTIZE_IMPL_COUNT
} ggml_quantize_impl_t;
typedef struct {
dequantize_row_q_t dequantize_row_q;
quantize_row_q_t quantize_row_q;
quantize_row_q_t quantize_row_q_reference;
quantize_row_q_t quantize_row_q[GGML_QUANTIZE_IMPL_COUNT];
vec_dot_q_t vec_dot_q;
} quantize_fns_t;
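With the reference and RMSE variants folded into a single table, callers pick an implementation by indexing quantize_row_q with a ggml_quantize_impl_t value instead of branching on a bool. A minimal sketch of the pattern (src, dst, out and k are placeholders; the functions and enums are the ones declared here):

    quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
    // quantize one row of k floats with the RMSE-searching implementation ...
    fns.quantize_row_q[GGML_QUANTIZE_IMPL_RMSE](src, dst, k);
    // ... and dequantize back to floats for comparison
    fns.dequantize_row_q(dst, out, k);

This is how test_roundtrip_on_layer in quantize-stats uses the table, while ggml_compute_forward_mul_mat_q_f32 keeps using the GGML_QUANTIZE_IMPL_SIMD entry.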

llama.cpp

@@ -644,7 +644,7 @@ static bool llama_model_load(
size_t total_size = 0;
model.n_loaded = 0;
while (true) {
while (size_t(fin.tellg()) + 12 < file_size) {
int32_t n_dims;
int32_t length;
int32_t ftype;
@@ -653,10 +653,6 @@ static bool llama_model_load(
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
@@ -707,6 +703,10 @@ static bool llama_model_load(
offset = (offset + 31) & -32;
tensor->data = mm_addr + offset;
fin.seekg(offset + tensor_data_size);
if (fin.eof()) {
fprintf(stderr, "%s: Truncated file?\n", __func__);
return false;
}
total_size += tensor_data_size;
model.n_loaded++;
@@ -717,6 +717,15 @@ static bool llama_model_load(
}
}
uint32_t version_minor = 0;
fin.read((char *)&version_minor, sizeof(version_minor));
if (fin.eof() || version_minor < LLAMA_FILE_VERSION_MINOR) {
static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Provide a helpful message that explains why the user may want to update their files");
if (model.hparams.f16 == 2) {
fprintf(stderr, "%s: WARN no minor version detected - your file will work but consider re-creating it for better quantization\n", __func__);
}
}
fin.close();
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
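Taken together, the loader changes add support for an optional trailer at the end of the model file: the tensor loop now exits once 12 bytes or fewer remain (too few for another n_dims/length/ftype header of three int32 fields), and anything left is read as a uint32 minor version, with a warning for q4_0 (f16 == 2) files that predate it. Informally, the resulting layout is:

    [header, vocab]
    [tensor 0: n_dims, length, ftype, dims, name, aligned data]
    ...
    [tensor N-1: ...]
    [optional uint32 version_minor]   // written by llama_model_quantize_internal below when LLAMA_FILE_VERSION_MINOR > 1 or itype == 2

Older files simply end after the last tensor; the version read then hits EOF and the file still loads.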
@@ -1572,6 +1581,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Check if this condition needs updating for minimal model checksum changes");
if ((LLAMA_FILE_VERSION_MINOR > 1) || (itype == 2)) {
uint32_t version_minor = LLAMA_FILE_VERSION_MINOR;
fout.write((char *)&version_minor, sizeof(version_minor));
}
finp.close();
fout.close();

llama.h

@@ -20,6 +20,7 @@
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_VERSION_MINOR 1 // for backward-compatible changes
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files

tests/test-quantize.c

@@ -13,18 +13,7 @@ int main(void) {
src[i] = (float)(i + 1);
}
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
assert(size == 20);
float max_result = ((float *)dst)[0];
float max_expected = src[31] / ((1 << 3) - 1);
assert(max_result == max_expected);
for (int i = 0; i < QK; i++) {
uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
assert(q4_result == q4_expected);
}
size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
size_t size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
assert(size == 24);
float delta_result = ((float *)dst)[0];
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);