Q4_0 scale selection using RMSE

2023-04-07 13:49:51 +02:00 · 2023-04-07 13:49:51 +02:00 · 40ebf819b0
commit 40ebf819b0
parent 62cfc54f77
9 changed files with 304 additions and 96 deletions
--- a/2
+++ b/2
@ -133,7 +133,7 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )

-default: main quantize perplexity embedding
+default: main quantize quantize-stats perplexity embedding

 #
 # Build library
--- a/4
+++ b/4
@ -1,7 +1,11 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
+0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976  models/7B/ggml-model-f16.bin
+5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3  models/7B/ggml-model-q4_0.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
+7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f  models/13B/ggml-model-f16.bin
+4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8  models/13B/ggml-model-q4_0.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -17,12 +17,15 @@
 static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32"  };
 static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");

+static const char * impl_strs[] = { "simd", "reference", "rmse" };
+static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list");
+
 struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
-    bool reference = false;
+    std::vector<ggml_quantize_impl_t> include_impl;
    std::vector<std::string> include_layers;
    std::vector<std::string> exclude_layers;
    std::vector<enum ggml_type> include_types;
@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -r, --reference\n");
-    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -i, --implementation\n");
+    fprintf(stderr, "                        select implementation (simd, reference, rmse)\n");
    fprintf(stderr, "  -v, --verbose\n");
    fprintf(stderr, "                        verbose output (default: false)\n");
    fprintf(stderr, "  -p, --per-layer-stats\n");
@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) {
    return INFINITY;
 }

-void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) {
    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
    double median = find_quantile(stats, .5);
    double pct95 = find_quantile(stats, .95);
-    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+    printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n",
+        name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median);
    if (print_histogram) {
        printf("Error distribution:\n");
        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
@ -136,7 +140,7 @@ void test_roundtrip_on_layer(
        std::string & name,
        bool print_layer_stats,
        const quantize_fns_t & qfns,
-        bool use_reference,
+        ggml_quantize_impl_t impl,
        const ggml_tensor * layer,
        float * input_scratch,
        char *quantized_scratch,
@ -158,11 +162,7 @@ void test_roundtrip_on_layer(
            input_scratch = ggml_get_data_f32(layer) + offset;
        }

-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
+        qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size);
        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);

        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
@ -171,7 +171,7 @@ void test_roundtrip_on_layer(
        }
    }
    if (print_layer_stats) {
-        print_error_stats(name, layer_error, false);
+        print_error_stats(name, impl, layer_error, false);
    }
 }

@ -190,8 +190,21 @@ int main(int argc, char ** argv) {
        if (arg == "-h" || arg == "--help") {
            quantize_stats_print_usage(argc, argv);
            exit(0);
-        } else if (arg == "-r" || arg == "--reference") {
-            params.reference = true;
+        } else if (arg == "-i" || arg == "--implementation") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) {
+                // find match
+            }
+            if (j < GGML_QUANTIZE_IMPL_COUNT) {
+                params.include_impl.push_back((ggml_quantize_impl_t)j);
+            } else {
+                fprintf(stderr, "error: %s not in list of implementations\n", argv[i]);
+                invalid_param = true;
+            }
        } else if (arg == "-v") {
            params.verbose = true;
        } else if (arg == "-p" || arg == "--per-layer-stats") {
@ -302,42 +315,48 @@ int main(int argc, char ** argv) {
    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
    std::vector<float> output_scratch(SCRATCH_ELEMENTS);

-    // loop throught quantization types
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+    // loop through quantization types
+    for (int type = 0; type < GGML_TYPE_COUNT; type++) {
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) {
            continue;
        }
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
-            if (params.verbose) {
-                printf("testing %s ...\n",  type_strs[i]);
-            }
-
-            error_stats global_stats {};
-
-            for (const auto& kv_tensor : tensors_sorted) {
-                if (!layer_included(params, kv_tensor.first)) {
+            for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) {
+                if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) {
                    continue;
                }
-                if (params.verbose) {
-                    printf("  %s ...\n",  kv_tensor.first.c_str());
-                }
-                std::string layer_name { type_strs[i] };
-                layer_name += "::" + kv_tensor.first;
-                test_roundtrip_on_layer(
-                        layer_name,
-                        params.per_layer_stats,
-                        qfns,
-                        params.reference,
-                        kv_tensor.second,
-                        input_scratch.data(),
-                        quantized_scratch.data(),
-                        output_scratch.data(),
-                        global_stats
-                );
-            }

-            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+                if (params.verbose) {
+                    printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]);
+                }
+
+                error_stats global_stats {};
+
+                for (const auto& kv_tensor : tensors_sorted) {
+                    if (!layer_included(params, kv_tensor.first)) {
+                        continue;
+                    }
+                    if (params.verbose) {
+                        printf("  %s ...\n",  kv_tensor.first.c_str());
+                    }
+                    std::string layer_name { type_strs[type] };
+                    layer_name += "::" + kv_tensor.first;
+                    test_roundtrip_on_layer(
+                            layer_name,
+                            params.per_layer_stats,
+                            qfns,
+                            (ggml_quantize_impl_t)impl,
+                            kv_tensor.second,
+                            input_scratch.data(),
+                            quantized_scratch.data(),
+                            output_scratch.data(),
+                            global_stats
+                    );
+                }
+
+                print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram);
+            }
        }
    }

--- a/examples/quantize/scale.py
+++ b/examples/quantize/scale.py
@ -0,0 +1,76 @@
+import matplotlib.pyplot as plt
+
+# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor.
+# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE.
+data = (
+    (-10.0, 0),
+    (-9.9, 1),
+    (-9.8, 3),
+    (-9.7, 65),
+    (-9.6, 738),
+    (-9.5, 5779),
+    (-9.4, 30880),
+    (-9.3, 121078),
+    (-9.2, 375674),
+    (-9.1, 941350),
+    (-9.0, 1990278),
+    (-8.9, 3635317),
+    (-8.8, 5891752),
+    (-8.7, 8678748),
+    (-8.6, 11771759),
+    (-8.5, 14873993),
+    (-8.4, 17594260),
+    (-8.3, 19553100),
+    (-8.2, 20415428),
+    (-8.1, 20017134),
+    (-8.0, 18357204),
+    (-7.9, 15597612),
+    (-7.8, 11993688),
+    (-7.7, 7842970),
+    (-7.6, 2880878),
+    (-7.5, 3478),
+    (-7.4, 2648437),
+    (-7.3, 5641970),
+    (-7.2, 5935890),
+    (-7.1, 4910790),
+    (-7.0, 3425891),
+    (-6.9, 2068250),
+    (-6.8, 1089883),
+    (-6.7, 502462),
+    (-6.6, 156356),
+    (-6.5, 205),
+    (-6.4, 163500),
+    (-6.3, 386291),
+    (-6.2, 423018),
+    (-6.1, 319360),
+    (-6.0, 180783),
+    (-5.9, 78822),
+    (-5.8, 28254),
+    (-5.7, 8698),
+    (-5.6, 1969),
+    (-5.5, 0),
+    (-5.4, 2069),
+    (-5.3, 5722),
+    (-5.2, 7107),
+    (-5.1, 5113),
+    (-5.0, 2332),
+    (-4.9, 636),
+    (-4.8, 130),
+    (-4.7, 12),
+    (-4.6, 1),
+    (-4.5, 0),
+    (-4.4, 3),
+    (-4.3, 4),
+    (-4.2, 8),
+    (-4.1, 8),
+    (-4.0, 27),
+)
+x, y = zip(*data)
+
+fig, ax = plt.subplots()
+b = ax.bar(x, y, 0.1, bottom=1)
+ax.set_yscale("log")
+ax.set_xlabel("scale")
+ax.set_ylabel("N")
+plt.title("Quantization scale factor with lowest RMS error")
+plt.show()
--- a/ggml.c
+++ b/ggml.c
@ -73,11 +73,15 @@ static int sched_yield (void) {
    Sleep (0);
    return 0;
 }
+
+#define __attribute__(...)
 #else
 #include <pthread.h>
 #include <stdatomic.h>

 typedef void* thread_ret_t;
+
+#define __declspec(...)
 #endif

 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@ -517,39 +521,128 @@ typedef struct {
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");

 // reference implementation for deterministic creation of model files
+static inline void quantize_block_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, float scale) {
+    uint8_t pp[QK/2];
+
+    float amax = 0.0f; // absolute max
+    float max = 0.0f;
+
+    for (int l = 0; l < QK; l++) {
+        const float v = x[l];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            max = v;
+        }
+    }
+
+    const float d = max / scale;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    y->d = d;
+
+    for (int l = 0; l < QK; l += 2) {
+        const float v0 = x[l + 0]*id;
+        const float v1 = x[l + 1]*id;
+
+        int8_t vs0 = roundf(v0);
+        int8_t vs1 = roundf(v1);
+
+        vs0 = MIN(MAX(0 - 8, vs0), 15 - 8);
+        vs1 = MIN(MAX(0 - 8, vs1), 15 - 8);
+
+        const uint8_t vi0 = vs0 + 8;    // guaranteed to fit into 4 bits
+        const uint8_t vi1 = vs1 + 8;    // thanks to the clamping of the signed values above
+
+        pp[l/2] = vi0 | (vi1 << 4);
+    }
+
+    memcpy(y->qs, pp, sizeof(pp));
+}
+
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
    assert(k % QK == 0);
    const int nb = k / QK;
+    for (int i = 0; i < nb; i++) {
+        quantize_block_q4_0_reference(x + i*QK, y + i, 7);
+    }
+}

-    uint8_t pp[QK/2];
+static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restrict y, int k) {
+    // For each q4_0 block, we try the following values to scale the shared float value
+    // and pick the one with lowest RMS error. We could do a more involved search,
+    // but this is a trade-off with speed of model generation and simplicity of the code.
+    // Operating on 8 values can reasonably be loop-unrolled or vectorized, but that is not
+    // manually done here.
+    // Values hand-picked according to histogram in examples/quantize/scale.py
+    // Include the value +7 of the old method to ensure we don't regress on RMSE on any block.
+    #define Q4_0_SCALE_CANDIDATE_COUNT 8
+    static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };
+
+    assert(k % QK == 0);
+    const int nb = k / QK;

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max
+        float max = 0.0f;

        for (int l = 0; l < QK; l++) {
            const float v = x[i*QK + l];
-            amax = MAX(amax, fabsf(v));
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
        }

-        const float d = amax / ((1 << 3) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
+        // find scale with lowest sum of squared errors, equivalent to lowest RMS error
+        float best_sqerr = +INFINITY;
+        float best_scale = NAN;

-        y[i].d = d;
+        for (int si = 0; si < Q4_0_SCALE_CANDIDATE_COUNT; si++) {
+            const float scale = candidates[si];
+            const float d = max / scale;
+            const float id = d ? 1.0f / d : 0.0f;
+            float sqe_acc = 0.f;
+#ifdef __AVX2__
+            const __m256 clamp_lo = _mm256_set1_ps( 0 - 8);
+            const __m256 clamp_hi = _mm256_set1_ps(15 - 8);
+            const __m256 id256    = _mm256_set1_ps(id);
+            for (int l = 0; l < QK; l += 8) {
+                // TODO: use _mm256_load_ps once the quantize loader uses mmap
+                __m256 v = _mm256_loadu_ps(&x[i * QK + l]);
+                v = _mm256_mul_ps(v, id256);
+                __m256 vs = _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+                vs = _mm256_min_ps(_mm256_max_ps(clamp_lo, vs), clamp_hi);
+                const __m256 err = _mm256_sub_ps(vs, v);
+                const __m256 sqe = _mm256_mul_ps(err, err);

-        for (int l = 0; l < QK; l += 2) {
-            const float v0 = x[i*QK + l + 0]*id;
-            const float v1 = x[i*QK + l + 1]*id;
+                // this is far from optimal speed-wise, but ensures identical results to scalar implementation
+                // we have to add the floats in sqe to sqe_acc separately and in the correct order
+                // 8x _mm_add_ps(,_mm_permute_ps()) would work but isn't faster than this:
+                __declspec(align(32)) float out[8] __attribute__((aligned(32)));
+                _mm256_store_ps(out, sqe);
+                for (int ei= 0; ei < 8; ei++) {
+                    sqe_acc += out[ei];
+                }
+            }
+#else
+            for (int l = 0; l < QK; l++) {
+                const float v = x[i * QK + l] * id;
+                int8_t vs = roundf(v);
+                vs = MIN(MAX(0 - 8, vs), 15 - 8);
+                sqe_acc += (vs - v) * (vs - v);
+            }
+#endif
+            // the square error sum is calculated on un-scaled q's inside the inner loop
+            sqe_acc *= d * d;

-            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
-            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            pp[l/2] = vi0 | (vi1 << 4);
+            if (best_sqerr > sqe_acc) {
+                best_sqerr = sqe_acc;
+                best_scale = scale;
+            }
        }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
+        assert(isfinite(best_sqerr));
+        assert(isfinite(best_scale));
+        quantize_block_q4_0_reference(x + i * QK, y + i, best_scale);
    }
 }

@ -6564,17 +6657,28 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    //}
 }

+static void quantize_row_q_missing(const float * x, void * y, int k) {
+    (void)x; (void)y; (void)k;
+    assert(false);
+}
+
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = {
        .dequantize_row_q         = dequantize_row_q4_0,
-        .quantize_row_q           = quantize_row_q4_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
+        .quantize_row_q           = {
+            [GGML_QUANTIZE_IMPL_SIMD]      = quantize_row_q4_0,
+            [GGML_QUANTIZE_IMPL_REFERENCE] = (quantize_row_q_t)quantize_row_q4_0_reference,
+            [GGML_QUANTIZE_IMPL_RMSE]      = (quantize_row_q_t)quantize_row_q4_0_rmse,
+        },
        .vec_dot_q                = ggml_vec_dot_q4_0,
    },
    [GGML_TYPE_Q4_1] = {
        .dequantize_row_q         = dequantize_row_q4_1,
-        .quantize_row_q           = quantize_row_q4_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
+        .quantize_row_q           = {
+            [GGML_QUANTIZE_IMPL_SIMD]      = quantize_row_q4_1,
+            [GGML_QUANTIZE_IMPL_REFERENCE] = quantize_row_q4_1_reference,
+            [GGML_QUANTIZE_IMPL_RMSE]      = quantize_row_q_missing,
+        },
        .vec_dot_q                = ggml_vec_dot_q4_1,
    },
 };
@ -6632,7 +6736,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
    GGML_ASSERT(ne3  == ne13);

    const enum ggml_type type = src0->type;
-    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
+    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q[GGML_QUANTIZE_IMPL_SIMD];
    vec_dot_q_t      const vec_dot_q      = quantize_fns[type].vec_dot_q;

    // we don't support permuted src0 or src1
@ -10602,7 +10706,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
    for (int j = 0; j < n; j += k) {
        block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK;

-        quantize_row_q4_0_reference(src + j, y, k);
+        quantize_row_q4_0_rmse(src + j, y, k);

        for (int i = 0; i < nb; i++) {
            for (int l = 0; l < QK; l += 2) {
--- a/ggml.h
+++ b/ggml.h
@ -788,20 +788,20 @@ int ggml_cpu_has_vsx(void);
 // Internal types and functions exposed for tests and benchmarks
 //

-#ifdef  __cplusplus
-// restrict not standard in C++
-#define GGML_RESTRICT
-#else
-#define GGML_RESTRICT restrict
-#endif
-typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+typedef void (*dequantize_row_q_t)(const void * x, float * y, int k);
+typedef void (*quantize_row_q_t)(const float * x, void * y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * s, const void * x, const void * y);
+
+typedef enum {
+    GGML_QUANTIZE_IMPL_SIMD,
+    GGML_QUANTIZE_IMPL_REFERENCE,
+    GGML_QUANTIZE_IMPL_RMSE,
+    GGML_QUANTIZE_IMPL_COUNT
+} ggml_quantize_impl_t;

 typedef struct {
    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q[GGML_QUANTIZE_IMPL_COUNT];
    vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;

--- a/llama.cpp
+++ b/llama.cpp
@ -644,7 +644,7 @@ static bool llama_model_load(
        size_t total_size = 0;
        model.n_loaded = 0;

-        while (true) {
+        while (size_t(fin.tellg()) + 12 < file_size) {
            int32_t n_dims;
            int32_t length;
            int32_t ftype;
@ -653,10 +653,6 @@ static bool llama_model_load(
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));

-            if (fin.eof()) {
-                break;
-            }
-
            int32_t nelements = 1;
            int32_t ne[2] = { 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
@ -707,6 +703,10 @@ static bool llama_model_load(
            offset = (offset + 31) & -32;
            tensor->data = mm_addr + offset;
            fin.seekg(offset + tensor_data_size);
+            if (fin.eof()) {
+               fprintf(stderr, "%s: Truncated file?\n", __func__);
+               return false;
+            }
            total_size += tensor_data_size;
            model.n_loaded++;

@ -717,6 +717,15 @@ static bool llama_model_load(
            }
        }

+        uint32_t version_minor = 0;
+        fin.read((char *)&version_minor, sizeof(version_minor));
+        if (fin.eof() || version_minor < LLAMA_FILE_VERSION_MINOR) {
+            static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Provide a helpful message that explains why the user may want to update their files");
+            if (model.hparams.f16 == 2) {
+                fprintf(stderr, "%s: WARN no minor version detected - your file will work but consider re-creating it for better quantization\n", __func__);
+            }
+        }
+
        fin.close();

        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
@ -1572,6 +1581,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
        }
    }

+    static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Check if this condition needs updating for minimal model checksum changes");
+    if ((LLAMA_FILE_VERSION_MINOR > 1) || (itype == 2)) {
+        uint32_t version_minor = LLAMA_FILE_VERSION_MINOR;
+        fout.write((char *)&version_minor, sizeof(version_minor));
+    }
+
    finp.close();
    fout.close();

--- a/llama.h
+++ b/llama.h
@ -20,6 +20,7 @@
 #endif

 #define LLAMA_FILE_VERSION 1
+#define LLAMA_FILE_VERSION_MINOR 1 // for backward-compatible changes
 #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
 #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files

--- a/tests/test-quantize.c
+++ b/tests/test-quantize.c
@ -13,18 +13,7 @@ int main(void) {
        src[i] = (float)(i + 1);
    }

-    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
-    assert(size == 20);
-    float max_result = ((float *)dst)[0];
-    float max_expected = src[31] / ((1 << 3) - 1);
-    assert(max_result == max_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
-        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
-        assert(q4_result == q4_expected);
-    }
-
-    size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
+    size_t size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
    assert(size == 24);
    float delta_result = ((float *)dst)[0];
    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);