From ed667e95816a74126252cca9a7f6390a3cf3ace0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?= <haakon@likedan.net>
Date: Sun, 2 Apr 2023 15:59:14 +0200
Subject: [PATCH] quantize-stats command

Command that calculates some statistics over the errors introduced by
quantization, at the moment mean square error and max error for layer
weights. Should be useful for testing quantization improvements.

Needs some internal state from ggml and llama that should not be part of
the public API.
---
 .gitignore                                 |   1 +
 Makefile                                   |   5 +-
 examples/CMakeLists.txt                    |   1 +
 examples/quantize-stats/CMakeLists.txt     |   4 +
 examples/quantize-stats/quantize-stats.cpp | 321 +++++++++++++++++++++
 ggml.c                                     |  17 +-
 ggml_internal.h                            |  25 ++
 llama.cpp                                  |   6 +
 llama_internal.h                           |  13 +
 9 files changed, 382 insertions(+), 11 deletions(-)
 create mode 100644 examples/quantize-stats/CMakeLists.txt
 create mode 100644 examples/quantize-stats/quantize-stats.cpp
 create mode 100644 ggml_internal.h
 create mode 100644 llama_internal.h
diff --git a/.gitignore b/.gitignore
index 1c75d38d1..3b8d02c30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ models/*
 
 /main
 /quantize
+/quantize-stats
 /result
 /perplexity
 /embedding
diff --git a/Makefile b/Makefile
index 2f828bf10..6745cdeef 100644
--- a/Makefile
+++ b/Makefile
@@ -148,7 +148,7 @@ common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
-	rm -vf *.o main quantize perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding
 
 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -159,6 +159,9 @@ main: examples/main/main.cpp ggml.o llama.o common.o
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
 
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index ce3a34710..67a7cea54 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
 else()
     add_subdirectory(main)
     add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
 endif()
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
new file mode 100644
index 000000000..7bebc11a1
--- /dev/null
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize-stats)
+add_executable(${TARGET} quantize-stats.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
new file mode 100644
index 000000000..e922c3a7a
--- /dev/null
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -0,0 +1,321 @@
+#include "ggml.h"
+#include "ggml_internal.h"
+#include "llama.h"
+#include "llama_internal.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32"  };
+static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
+
+struct quantize_stats_params {
+    std::string model = "models/7B/ggml-model-f16.bin";
+    bool verbose = false;
+    bool per_layer_stats = false;
+    bool print_histogram = false;
+    std::vector<std::string> include_layers;
+    std::vector<std::string> exclude_layers;
+    std::vector<enum ggml_type> include_types;
+};
+
+const size_t HISTOGRAM_BUCKETS = 30;
+const double HISTOGRAM_RANGE = 0.03;
+
+struct error_stats {
+    size_t num_samples;
+    double total_error;
+    double max_error;
+    uint64_t error_histogram[HISTOGRAM_BUCKETS];
+};
+
+
+void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+    quantize_stats_params params;
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -v, --verbose\n");
+    fprintf(stderr, "                        verbose output (default: false)\n");
+    fprintf(stderr, "  -p, --per-layer-stats\n");
+    fprintf(stderr, "                        print stats per layer (default: false)\n");
+    fprintf(stderr, "  --histogram\n");
+    fprintf(stderr, "                        print error histogram (default: false)\n");
+    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, "                        only test layers containing substring\n");
+    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, "                        exclude layers containing substring\n");
+    fprintf(stderr, "  -t TYPE, --type TYPE\n");
+    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
+    fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line
+bool layer_included(const quantize_stats_params params, const std::string & layer) {
+    for (const auto& excluded : params.exclude_layers) {
+        if (layer.find(excluded) != std::string::npos) {
+            return false;
+        }
+    }
+    for (const auto& included : params.include_layers) {
+        if (layer.find(included) != std::string::npos) {
+            return true;
+        }
+    }
+    return params.include_layers.empty();
+}
+
+// Update error statistics given vectors with the before/after result of quantization
+void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    for (int64_t i = 0; i < nelements; i++) {
+        double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
+    }
+    stats.num_samples += nelements;
+}
+
+void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+    printf("%-50s: mse %.8f, maxerr %.8f\n", name.c_str(), stats.total_error / (double) stats.num_samples, stats.max_error);
+    if (print_histogram) {
+        printf("Error distribution:\n");
+        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
+            printf("[%3.3f, %3.3f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+        }
+    }
+}
+
+// copied from ggml.h - verify that we can access this as a flat array
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+// Run quantization function for a single layer and update error stats
+void test_roundtrip_on_layer(
+        std::string & name,
+        bool print_layer_stats,
+        const quantize_fns_t & qfns,
+        const ggml_tensor * layer,
+        float * input_scratch,
+        char *quantized_scratch,
+        float * output_scratch,
+        error_stats & total_error) {
+
+    assert(tensor_is_contiguous(layer));
+    int64_t nelements = ggml_nelements(layer);
+
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < nelements; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer);
+    }
+
+    qfns.quantize_row_q(input_scratch, quantized_scratch, nelements);
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, nelements);
+
+    update_error_stats(nelements, input_scratch, output_scratch, total_error);
+    if (print_layer_stats) {
+        error_stats layer_error {};
+        update_error_stats(nelements, input_scratch, output_scratch, layer_error);
+        print_error_stats(name, layer_error, false);
+    }
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    quantize_stats_params params;
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-v") {
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.push_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.push_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+                // find match
+            }
+            if (j < GGML_TYPE_COUNT) {
+                params.include_types.push_back((ggml_type) j);
+            } else {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+            }
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+
+    // load the model
+    fprintf(stderr, "Loading model\n");
+
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_context * ctx;
+
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx      = 256;
+        lparams.n_parts    = 1;
+        lparams.seed       = 1;
+        lparams.f16_kv     = false;
+        lparams.use_mlock  = false;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // Sort tensors for consistent output
+    const auto tensors = llama_internal_get_tensor_map(ctx);
+    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
+
+    // check layer tensors
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto& kv_tensor : tensors_sorted) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 ", allocating %" PRId64 " bytes\n", included_layers, max_nelements, 3*4*max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch(max_nelements);
+    std::vector<char> quantized_scratch(max_nelements*4);
+    std::vector<float> output_scratch(max_nelements);
+
+    // loop throught quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (params.verbose) {
+                printf("testing %s ...\n",  type_strs[i]);
+            }
+
+            error_stats global_stats {};
+
+            for (const auto& kv_tensor : tensors_sorted) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n",  kv_tensor.first.c_str());
+                }
+                std::string layer_name { type_strs[i] };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        kv_tensor.second,
+                        input_scratch.data(),
+                        quantized_scratch.data(),
+                        output_scratch.data(),
+                        global_stats
+                );
+            }
+
+            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
diff --git a/ggml.c b/ggml.c
index 63aa5eb6e..3a28616ff 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2,6 +2,7 @@
 #define _GNU_SOURCE
 
 #include "ggml.h"
+#include "ggml_internal.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -6496,16 +6497,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
-typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
-typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
-
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    vec_dot_q_t        vec_dot_q;
-} quantize_fns_t;
-
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
         .dequantize_row_q = dequantize_row_q4_0,
@@ -6519,6 +6510,12 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     },
 };
 
+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+    GGML_ASSERT(i < GGML_TYPE_COUNT);
+    return quantize_fns[i];
+}
+
 static void ggml_compute_forward_mul_mat_q_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
diff --git a/ggml_internal.h b/ggml_internal.h
new file mode 100644
index 000000000..0761bad3e
--- /dev/null
+++ b/ggml_internal.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// Internal functions exposed for tests and benchmarks
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define restrict
+extern "C" {
+#endif
+
+typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
+typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/llama.cpp b/llama.cpp
index 854bb8993..b736dd88a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,4 +1,5 @@
 #include "llama.h"
+#include "llama_internal.h"
 
 #include "ggml.h"
 
@@ -1854,3 +1855,8 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
+// For internal test use
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+    return ctx->model.tensors;
+}
diff --git a/llama_internal.h b/llama_internal.h
new file mode 100644
index 000000000..25c8c2c87
--- /dev/null
+++ b/llama_internal.h
@@ -0,0 +1,13 @@
+#ifndef LLAMA_INTERNAL_H
+#define LLAMA_INTERNAL_H
+
+// Internal functions exposed for tests and benchmarks
+
+#include "ggml.h"
+
+#include <string>
+#include <unordered_map>
+
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif