Q4_0 scale selection using RMSE
This commit is contained in:
parent
62cfc54f77
commit
40ebf819b0
9 changed files with 304 additions and 96 deletions
2
Makefile
2
Makefile
|
@ -133,7 +133,7 @@ $(info I CC: $(CCV))
|
|||
$(info I CXX: $(CXXV))
|
||||
$(info )
|
||||
|
||||
default: main quantize perplexity embedding
|
||||
default: main quantize quantize-stats perplexity embedding
|
||||
|
||||
#
|
||||
# Build library
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
|
||||
0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976 models/7B/ggml-model-f16.bin
|
||||
5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin
|
||||
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
|
||||
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
|
||||
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
|
||||
7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f models/13B/ggml-model-f16.bin
|
||||
4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8 models/13B/ggml-model-q4_0.bin
|
||||
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
|
||||
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
|
||||
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
|
||||
|
|
|
@ -17,12 +17,15 @@
|
|||
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
|
||||
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
|
||||
|
||||
static const char * impl_strs[] = { "simd", "reference", "rmse" };
|
||||
static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
|
||||
|
||||
struct quantize_stats_params {
|
||||
std::string model = "models/7B/ggml-model-f16.bin";
|
||||
bool verbose = false;
|
||||
bool per_layer_stats = false;
|
||||
bool print_histogram = false;
|
||||
bool reference = false;
|
||||
std::vector<ggml_quantize_impl_t> include_impl;
|
||||
std::vector<std::string> include_layers;
|
||||
std::vector<std::string> exclude_layers;
|
||||
std::vector<enum ggml_type> include_types;
|
||||
|
@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
|||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -r, --reference\n");
|
||||
fprintf(stderr, " use reference implementation (default: false)\n");
|
||||
fprintf(stderr, " -i, --implementation\n");
|
||||
fprintf(stderr, " select implementation (simd, reference, rmse)\n");
|
||||
fprintf(stderr, " -v, --verbose\n");
|
||||
fprintf(stderr, " verbose output (default: false)\n");
|
||||
fprintf(stderr, " -p, --per-layer-stats\n");
|
||||
|
@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
|
|||
return INFINITY;
|
||||
}
|
||||
|
||||
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
|
||||
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
||||
double median = find_quantile(stats, .5);
|
||||
double pct95 = find_quantile(stats, .95);
|
||||
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
|
||||
printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
|
||||
name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
|
||||
if (print_histogram) {
|
||||
printf("Error distribution:\n");
|
||||
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
|
||||
|
@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
|
|||
std::string & name,
|
||||
bool print_layer_stats,
|
||||
const quantize_fns_t & qfns,
|
||||
bool use_reference,
|
||||
ggml_quantize_impl_t impl,
|
||||
const ggml_tensor * layer,
|
||||
float * input_scratch,
|
||||
char *quantized_scratch,
|
||||
|
@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
|
|||
input_scratch = ggml_get_data_f32(layer) + offset;
|
||||
}
|
||||
|
||||
if (use_reference) {
|
||||
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
|
||||
} else {
|
||||
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
|
||||
}
|
||||
qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
|
||||
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
|
||||
|
||||
update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
|
||||
|
@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
|
|||
}
|
||||
}
|
||||
if (print_layer_stats) {
|
||||
print_error_stats(name, layer_error, false);
|
||||
print_error_stats(name, impl, layer_error, false);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
|
|||
if (arg == "-h" || arg == "--help") {
|
||||
quantize_stats_print_usage(argc, argv);
|
||||
exit(0);
|
||||
} else if (arg == "-r" || arg == "--reference") {
|
||||
params.reference = true;
|
||||
} else if (arg == "-i" || arg == "--implementation") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
int j;
|
||||
for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
|
||||
// find match
|
||||
}
|
||||
if (j < GGML_QUANTIZE_IMPL_COUNT) {
|
||||
params.include_impl.push_back((ggml_quantize_impl_t)j);
|
||||
} else {
|
||||
fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
|
||||
invalid_param = true;
|
||||
}
|
||||
} else if (arg == "-v") {
|
||||
params.verbose = true;
|
||||
} else if (arg == "-p" || arg == "--per-layer-stats") {
|
||||
|
@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
|
|||
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
|
||||
std::vector<float> output_scratch(SCRATCH_ELEMENTS);
|
||||
|
||||
// loop throught quantization types
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
// loop through quantization types
|
||||
for (int type = 0; type < GGML_TYPE_COUNT; type++) {
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
if (params.verbose) {
|
||||
printf("testing %s ...\n", type_strs[i]);
|
||||
}
|
||||
|
||||
error_stats global_stats {};
|
||||
|
||||
for (const auto& kv_tensor : tensors_sorted) {
|
||||
if (!layer_included(params, kv_tensor.first)) {
|
||||
for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
|
||||
if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||
}
|
||||
std::string layer_name { type_strs[i] };
|
||||
layer_name += "::" + kv_tensor.first;
|
||||
test_roundtrip_on_layer(
|
||||
layer_name,
|
||||
params.per_layer_stats,
|
||||
qfns,
|
||||
params.reference,
|
||||
kv_tensor.second,
|
||||
input_scratch.data(),
|
||||
quantized_scratch.data(),
|
||||
output_scratch.data(),
|
||||
global_stats
|
||||
);
|
||||
}
|
||||
|
||||
print_error_stats(type_strs[i], global_stats, params.print_histogram);
|
||||
if (params.verbose) {
|
||||
printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
|
||||
}
|
||||
|
||||
error_stats global_stats {};
|
||||
|
||||
for (const auto& kv_tensor : tensors_sorted) {
|
||||
if (!layer_included(params, kv_tensor.first)) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||
}
|
||||
std::string layer_name { type_strs[type] };
|
||||
layer_name += "::" + kv_tensor.first;
|
||||
test_roundtrip_on_layer(
|
||||
layer_name,
|
||||
params.per_layer_stats,
|
||||
qfns,
|
||||
(ggml_quantize_impl_t)impl,
|
||||
kv_tensor.second,
|
||||
input_scratch.data(),
|
||||
quantized_scratch.data(),
|
||||
output_scratch.data(),
|
||||
global_stats
|
||||
);
|
||||
}
|
||||
|
||||
print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
76
examples/quantize/scale.py
Normal file
76
examples/quantize/scale.py
Normal file
|
@ -0,0 +1,76 @@
|
|||
"""Plot how often each candidate q4_0 scale factor gives the lowest RMS error."""
import matplotlib.pyplot as plt

# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
data = (
    (-10.0, 0),
    (-9.9, 1),
    (-9.8, 3),
    (-9.7, 65),
    (-9.6, 738),
    (-9.5, 5779),
    (-9.4, 30880),
    (-9.3, 121078),
    (-9.2, 375674),
    (-9.1, 941350),
    (-9.0, 1990278),
    (-8.9, 3635317),
    (-8.8, 5891752),
    (-8.7, 8678748),
    (-8.6, 11771759),
    (-8.5, 14873993),
    (-8.4, 17594260),
    (-8.3, 19553100),
    (-8.2, 20415428),
    (-8.1, 20017134),
    (-8.0, 18357204),
    (-7.9, 15597612),
    (-7.8, 11993688),
    (-7.7, 7842970),
    (-7.6, 2880878),
    (-7.5, 3478),
    (-7.4, 2648437),
    (-7.3, 5641970),
    (-7.2, 5935890),
    (-7.1, 4910790),
    (-7.0, 3425891),
    (-6.9, 2068250),
    (-6.8, 1089883),
    (-6.7, 502462),
    (-6.6, 156356),
    (-6.5, 205),
    (-6.4, 163500),
    (-6.3, 386291),
    (-6.2, 423018),
    (-6.1, 319360),
    (-6.0, 180783),
    (-5.9, 78822),
    (-5.8, 28254),
    (-5.7, 8698),
    (-5.6, 1969),
    (-5.5, 0),
    (-5.4, 2069),
    (-5.3, 5722),
    (-5.2, 7107),
    (-5.1, 5113),
    (-5.0, 2332),
    (-4.9, 636),
    (-4.8, 130),
    (-4.7, 12),
    (-4.6, 1),
    (-4.5, 0),
    (-4.4, 3),
    (-4.3, 4),
    (-4.2, 8),
    (-4.1, 8),
    (-4.0, 27),
)

# Split the (scale, count) pairs into the two plotted series.
x = tuple(scale for scale, _ in data)
y = tuple(count for _, count in data)

fig, ax = plt.subplots()
# Bars are 0.1 wide (the spacing between scale factors) and start at 1 so that
# zero counts remain drawable on the logarithmic y axis.
b = ax.bar(x, y, width=0.1, bottom=1)
ax.set_yscale("log")
ax.set_xlabel("scale")
ax.set_ylabel("N")
ax.set_title("Quantization scale factor with lowest RMS error")
plt.show()
|
150
ggml.c
150
ggml.c
|
@ -73,11 +73,15 @@ static int sched_yield (void) {
|
|||
Sleep (0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define __attribute__(...)
|
||||
#else
|
||||
#include <pthread.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
typedef void* thread_ret_t;
|
||||
|
||||
#define __declspec(...)
|
||||
#endif
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
|
@ -517,39 +521,128 @@ typedef struct {
|
|||
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
// Quantize one block of QK floats from x into the single q4_0 block y.
//
// x     - QK input values
// y     - output block; y->d receives the dequantization factor and y->qs the packed 4-bit codes
// scale - divisor applied to the signed value of the largest-magnitude element to obtain y->d
//         (callers in this file pass 7 or one of the RMSE candidates)
//
// Each value is encoded as clamp(round(x[l] / d), -8, 7) + 8, i.e. a code in [0, 15].
// Even-indexed elements go into the low nibble, odd-indexed elements into the high nibble.
static inline void quantize_block_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, float scale) {
    uint8_t pp[QK/2];

    float amax = 0.0f; // absolute max
    float max = 0.0f;  // signed value of the element with the largest magnitude

    for (int l = 0; l < QK; l++) {
        const float v = x[l];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            max = v;
        }
    }

    const float d = max / scale;
    // an all-zero block gives d == 0; use id == 0 so that every element encodes as 8 (i.e. 0.0)
    const float id = d ? 1.0f/d : 0.0f;

    y->d = d;

    for (int l = 0; l < QK; l += 2) {
        // Clamp while still in floating point: converting an out-of-range or NaN float
        // to an integer type is undefined behaviour, so the narrowing must come last.
        // fmaxf/fminf return the non-NaN operand, so a NaN product ends up as -8.
        const float r0 = fminf(fmaxf(roundf(x[l + 0]*id), 0.0f - 8.0f), 15.0f - 8.0f);
        const float r1 = fminf(fmaxf(roundf(x[l + 1]*id), 0.0f - 8.0f), 15.0f - 8.0f);

        const uint8_t vi0 = (uint8_t)((int)r0 + 8); // guaranteed to fit into 4 bits
        const uint8_t vi1 = (uint8_t)((int)r1 + 8); // thanks to the clamping of the signed values above

        pp[l/2] = vi0 | (vi1 << 4);
    }

    memcpy(y->qs, pp, sizeof(pp));
}
|
||||
|
||||
// Quantize a row of k floats into k/QK consecutive q4_0 blocks, using the fixed
// scale divisor 7 for every block. k must be a multiple of QK.
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
    assert(k % QK == 0);

    const float * src = x;
    block_q4_0  * dst = y;

    // walk input and output one block at a time
    for (int remaining = k / QK; remaining > 0; remaining--) {
        quantize_block_q4_0_reference(src, dst, 7);
        src += QK;
        dst += 1;
    }
}
|
||||
|
||||
uint8_t pp[QK/2];
|
||||
// Quantize a row of k floats into k/QK q4_0 blocks, choosing for every block the scale
// divisor (out of a small fixed candidate list) that yields the lowest sum of squared
// round-trip errors. k must be a multiple of QK.
// The actual encoding is delegated to quantize_block_q4_0_reference with the chosen scale.
static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restrict y, int k) {
    // For each q4_0 block, we try the following values to scale the shared float value
    // and pick the one with lowest RMS error. We could do a more involved search,
    // but this is a trade-off with speed of model generation and simplicity of the code.
    // Operating on 8 values can reasonably be loop-unrolled or vectorized, but that is not
    // manually done here.
    // Values hand-picked according to histogram in examples/quantize/scale.py
    // Include the value +7 of the old method to ensure we don't regress on RMSE on any block.
#define Q4_0_SCALE_CANDIDATE_COUNT 8
    static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };

    assert(k % QK == 0);
    const int nb = k / QK;

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max
        float max = 0.0f;  // signed value of the element with the largest magnitude

        for (int l = 0; l < QK; l++) {
            const float v = x[i*QK + l];
            if (amax < fabsf(v)) {
                amax = fabsf(v);
                max = v;
            }
        }

        // find scale with lowest sum of squared errors, equivalent to lowest RMS error
        float best_sqerr = +INFINITY;
        float best_scale = NAN;

        for (int si = 0; si < Q4_0_SCALE_CANDIDATE_COUNT; si++) {
            const float scale = candidates[si];
            const float d = max / scale;
            const float id = d ? 1.0f / d : 0.0f;
            float sqe_acc = 0.f;
#ifdef __AVX2__
            const __m256 clamp_lo = _mm256_set1_ps( 0 - 8);
            const __m256 clamp_hi = _mm256_set1_ps(15 - 8);
            const __m256 id256 = _mm256_set1_ps(id);
            for (int l = 0; l < QK; l += 8) {
                // TODO: use _mm256_load_ps once the quantize loader uses mmap
                __m256 v = _mm256_loadu_ps(&x[i * QK + l]);
                v = _mm256_mul_ps(v, id256);
                // NOTE(review): _MM_FROUND_TO_NEAREST_INT rounds exact ties to even while roundf()
                // in the scalar path rounds ties away from zero - confirm this cannot change the
                // selected scale before relying on bit-identical results across builds.
                __m256 vs = _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
                vs = _mm256_min_ps(_mm256_max_ps(clamp_lo, vs), clamp_hi);
                const __m256 err = _mm256_sub_ps(vs, v);
                const __m256 sqe = _mm256_mul_ps(err, err);

                // this is far from optimal speed-wise, but ensures identical results to scalar implementation
                // we have to add the floats in sqe to sqe_acc separately and in the correct order
                // 8x _mm_add_ps(,_mm_permute_ps()) would work but isn't faster than this:
                __declspec(align(32)) float out[8] __attribute__((aligned(32)));
                _mm256_store_ps(out, sqe);
                for (int ei= 0; ei < 8; ei++) {
                    sqe_acc += out[ei];
                }
            }
#else
            for (int l = 0; l < QK; l++) {
                const float v = x[i * QK + l] * id;
                int8_t vs = roundf(v);
                vs = MIN(MAX(0 - 8, vs), 15 - 8);
                sqe_acc += (vs - v) * (vs - v);
            }
#endif
            // the square error sum is calculated on un-scaled q's inside the inner loop
            sqe_acc *= d * d;

            // strict comparison: on equal error the earlier candidate in the list is kept
            if (best_sqerr > sqe_acc) {
                best_sqerr = sqe_acc;
                best_scale = scale;
            }
        }

        assert(isfinite(best_sqerr));
        assert(isfinite(best_scale));
        quantize_block_q4_0_reference(x + i * QK, y + i, best_scale);
    }
}
|
||||
|
||||
|
@ -6564,17 +6657,28 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
//}
|
||||
}
|
||||
|
||||
static void quantize_row_q_missing(const float * x, void * y, int k) {
|
||||
(void)x; (void)y; (void)k;
|
||||
assert(false);
|
||||
}
|
||||
|
||||
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_Q4_0] = {
|
||||
.dequantize_row_q = dequantize_row_q4_0,
|
||||
.quantize_row_q = quantize_row_q4_0,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
|
||||
.quantize_row_q = {
|
||||
[GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_0,
|
||||
[GGML_QUANTIZE_IMPL_REFERENCE] = (quantize_row_q_t)quantize_row_q4_0_reference,
|
||||
[GGML_QUANTIZE_IMPL_RMSE] = (quantize_row_q_t)quantize_row_q4_0_rmse,
|
||||
},
|
||||
.vec_dot_q = ggml_vec_dot_q4_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_1] = {
|
||||
.dequantize_row_q = dequantize_row_q4_1,
|
||||
.quantize_row_q = quantize_row_q4_1,
|
||||
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
|
||||
.quantize_row_q = {
|
||||
[GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_1,
|
||||
[GGML_QUANTIZE_IMPL_REFERENCE] = quantize_row_q4_1_reference,
|
||||
[GGML_QUANTIZE_IMPL_RMSE] = quantize_row_q_missing,
|
||||
},
|
||||
.vec_dot_q = ggml_vec_dot_q4_1,
|
||||
},
|
||||
};
|
||||
|
@ -6632,7 +6736,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
|
||||
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q[GGML_QUANTIZE_IMPL_SIMD];
|
||||
vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
|
@ -10602,7 +10706,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
|
|||
for (int j = 0; j < n; j += k) {
|
||||
block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK;
|
||||
|
||||
quantize_row_q4_0_reference(src + j, y, k);
|
||||
quantize_row_q4_0_rmse(src + j, y, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int l = 0; l < QK; l += 2) {
|
||||
|
|
22
ggml.h
22
ggml.h
|
@ -788,20 +788,20 @@ int ggml_cpu_has_vsx(void);
|
|||
// Internal types and functions exposed for tests and benchmarks
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
#else
|
||||
#define GGML_RESTRICT restrict
|
||||
#endif
|
||||
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
||||
typedef void (*dequantize_row_q_t)(const void * x, float * y, int k);
|
||||
typedef void (*quantize_row_q_t)(const float * x, void * y, int k);
|
||||
typedef void (*vec_dot_q_t)(const int n, float * s, const void * x, const void * y);
|
||||
|
||||
// Identifies one of the available quantization routine variants.
// The values are used as indices into quantize_fns_t::quantize_row_q, so they
// must stay dense and start at zero; GGML_QUANTIZE_IMPL_COUNT is the number of
// variants and must remain last.
typedef enum {
    GGML_QUANTIZE_IMPL_SIMD      = 0,
    GGML_QUANTIZE_IMPL_REFERENCE = 1,
    GGML_QUANTIZE_IMPL_RMSE      = 2,
    GGML_QUANTIZE_IMPL_COUNT     = 3
} ggml_quantize_impl_t;
|
||||
|
||||
typedef struct {
|
||||
dequantize_row_q_t dequantize_row_q;
|
||||
quantize_row_q_t quantize_row_q;
|
||||
quantize_row_q_t quantize_row_q_reference;
|
||||
quantize_row_q_t quantize_row_q[GGML_QUANTIZE_IMPL_COUNT];
|
||||
vec_dot_q_t vec_dot_q;
|
||||
} quantize_fns_t;
|
||||
|
||||
|
|
25
llama.cpp
25
llama.cpp
|
@ -644,7 +644,7 @@ static bool llama_model_load(
|
|||
size_t total_size = 0;
|
||||
model.n_loaded = 0;
|
||||
|
||||
while (true) {
|
||||
while (size_t(fin.tellg()) + 12 < file_size) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ftype;
|
||||
|
@ -653,10 +653,6 @@ static bool llama_model_load(
|
|||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
|
@ -707,6 +703,10 @@ static bool llama_model_load(
|
|||
offset = (offset + 31) & -32;
|
||||
tensor->data = mm_addr + offset;
|
||||
fin.seekg(offset + tensor_data_size);
|
||||
if (fin.eof()) {
|
||||
fprintf(stderr, "%s: Truncated file?\n", __func__);
|
||||
return false;
|
||||
}
|
||||
total_size += tensor_data_size;
|
||||
model.n_loaded++;
|
||||
|
||||
|
@ -717,6 +717,15 @@ static bool llama_model_load(
|
|||
}
|
||||
}
|
||||
|
||||
uint32_t version_minor = 0;
|
||||
fin.read((char *)&version_minor, sizeof(version_minor));
|
||||
if (fin.eof() || version_minor < LLAMA_FILE_VERSION_MINOR) {
|
||||
static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Provide a helpful message that explains why the user may want to update their files");
|
||||
if (model.hparams.f16 == 2) {
|
||||
fprintf(stderr, "%s: WARN no minor version detected - your file will work but consider re-creating it for better quantization\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
|
||||
|
@ -1572,6 +1581,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
}
|
||||
}
|
||||
|
||||
static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Check if this condition needs updating for minimal model checksum changes");
|
||||
if ((LLAMA_FILE_VERSION_MINOR > 1) || (itype == 2)) {
|
||||
uint32_t version_minor = LLAMA_FILE_VERSION_MINOR;
|
||||
fout.write((char *)&version_minor, sizeof(version_minor));
|
||||
}
|
||||
|
||||
finp.close();
|
||||
fout.close();
|
||||
|
||||
|
|
1
llama.h
1
llama.h
|
@ -20,6 +20,7 @@
|
|||
#endif
|
||||
|
||||
#define LLAMA_FILE_VERSION 1
|
||||
#define LLAMA_FILE_VERSION_MINOR 1 // for backward-compatible changes
|
||||
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
|
||||
|
||||
|
|
|
@ -13,18 +13,7 @@ int main(void) {
|
|||
src[i] = (float)(i + 1);
|
||||
}
|
||||
|
||||
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
|
||||
assert(size == 20);
|
||||
float max_result = ((float *)dst)[0];
|
||||
float max_expected = src[31] / ((1 << 3) - 1);
|
||||
assert(max_result == max_expected);
|
||||
for (int i = 0; i < QK; i++) {
|
||||
uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
|
||||
uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
|
||||
size_t size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
|
||||
assert(size == 24);
|
||||
float delta_result = ((float *)dst)[0];
|
||||
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue