diff --git a/.gitignore b/.gitignore
index 1c75d38d1..3b8d02c30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ models/*
 
 /main
 /quantize
+/quantize-stats
 /result
 /perplexity
 /embedding
diff --git a/Makefile b/Makefile
index 2f828bf10..6745cdeef 100644
--- a/Makefile
+++ b/Makefile
@@ -148,7 +148,7 @@ common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
-	rm -vf *.o main quantize perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding
 
 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -159,6 +159,9 @@ main: examples/main/main.cpp ggml.o llama.o common.o
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
 
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index ce3a34710..67a7cea54 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
 else()
     add_subdirectory(main)
     add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
 endif()
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
new file mode 100644
index 000000000..7bebc11a1
--- /dev/null
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize-stats)
+add_executable(${TARGET} quantize-stats.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
new file mode 100644
index 000000000..e922c3a7a
--- /dev/null
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -0,0 +1,337 @@
+#include "ggml.h"
+#include "ggml_internal.h"
+#include "llama.h"
+#include "llama_internal.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+
+// Printable type names, indexed by ggml_type (the static_assert keeps the list complete)
+static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
+static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
+
+// Options collected from the command line
+struct quantize_stats_params {
+    std::string model = "models/7B/ggml-model-f16.bin";
+    bool verbose = false;
+    bool per_layer_stats = false;
+    bool print_histogram = false;
+    std::vector<std::string> include_layers; // substrings; empty means "all layers"
+    std::vector<std::string> exclude_layers; // substrings; take precedence over include_layers
+    std::vector<enum ggml_type> include_types; // empty means "all types"
+};
+
+// The error histogram has HISTOGRAM_BUCKETS equal-width buckets covering
+// [0, HISTOGRAM_RANGE); the last bucket also collects every larger error
+const size_t HISTOGRAM_BUCKETS = 30;
+const double HISTOGRAM_RANGE = 0.03;
+
+// Round-trip error accumulated over num_samples elements
+struct error_stats {
+    size_t num_samples;
+    double total_error; // sum of squared differences
+    double max_error;   // largest absolute difference
+    uint64_t error_histogram[HISTOGRAM_BUCKETS];
+};
+
+
+// Print the command line help to stderr
+void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+    quantize_stats_params params;
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, " -h, --help show this help message and exit\n");
+    fprintf(stderr, " -m FNAME, --model FNAME\n");
+    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, " -v, --verbose\n");
+    fprintf(stderr, " verbose output (default: false)\n");
+    fprintf(stderr, " -p, --per-layer-stats\n");
+    fprintf(stderr, " print stats per layer (default: false)\n");
+    fprintf(stderr, " --histogram\n");
+    fprintf(stderr, " print error histogram (default: false)\n");
+    fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, " only test layers containing substring\n");
+    fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, " exclude layers containing substring\n");
+    fprintf(stderr, " -t TYPE, --type TYPE\n");
+    fprintf(stderr, " only test given type (q4_0, q4_1)\n");
+    fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line.
+// Exclusions win over inclusions; an empty include list admits every layer.
+bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+    for (const auto& excluded : params.exclude_layers) {
+        if (layer.find(excluded) != std::string::npos) {
+            return false;
+        }
+    }
+    for (const auto& included : params.include_layers) {
+        if (layer.find(included) != std::string::npos) {
+            return true;
+        }
+    }
+    return params.include_layers.empty();
+}
+
+// Update error statistics given vectors with the before/after result of quantization
+void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    for (int64_t i = 0; i < nelements; i++) {
+        const double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        // clamp into the last bucket so that outliers are still counted
+        const size_t bucket = std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS - 1);
+        stats.error_histogram[bucket]++;
+    }
+    stats.num_samples += nelements;
+}
+
+// Print the mean squared error and maximum error of stats, optionally followed by the histogram
+void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+    printf("%-50s: mse %.8f, maxerr %.8f\n", name.c_str(), stats.total_error / (double) stats.num_samples, stats.max_error);
+    if (print_histogram) {
+        printf("Error distribution:\n");
+        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
+            printf("[%3.3f, %3.3f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+        }
+    }
+}
+
+// copied from ggml.h - verify that we can access this as a flat array
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+// Run quantization function for a single layer and update error stats.
+// input_scratch and output_scratch need room for every element of the layer,
+// quantized_scratch for its quantized form
+void test_roundtrip_on_layer(
+    const std::string & name,
+    bool print_layer_stats,
+    const quantize_fns_t & qfns,
+    const ggml_tensor * layer,
+    float * input_scratch,
+    char *quantized_scratch,
+    float * output_scratch,
+    error_stats & total_error) {
+
+    assert(tensor_is_contiguous(layer));
+    const int64_t nelements = ggml_nelements(layer);
+
+    if (layer->type == GGML_TYPE_F16) {
+        // widen to f32 first, the quantizers only take f32 input
+        for (int i = 0; i < nelements; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i);
+        }
+    } else {
+        // f32 data can be read in place
+        input_scratch = ggml_get_data_f32(layer);
+    }
+
+    qfns.quantize_row_q(input_scratch, quantized_scratch, nelements);
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, nelements);
+
+    update_error_stats(nelements, input_scratch, output_scratch, total_error);
+    if (print_layer_stats) {
+        error_stats layer_error {};
+        update_error_stats(nelements, input_scratch, output_scratch, layer_error);
+        print_error_stats(name, layer_error, false);
+    }
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    quantize_stats_params params;
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.push_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.push_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+                // find match
+            }
+            if (j < GGML_TYPE_COUNT) {
+                params.include_types.push_back((ggml_type) j);
+            } else {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+                // stop parsing so that the error below names this option
+                break;
+            }
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+
+    // load the model
+    fprintf(stderr, "Loading model\n");
+
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_context * ctx;
+
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx = 256;
+        lparams.n_parts = 1;
+        lparams.seed = 1;
+        lparams.f16_kv = false;
+        lparams.use_mlock = false;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // Sort tensors for consistent output
+    const auto & tensors = llama_internal_get_tensor_map(ctx);
+    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
+
+    // check layer tensors
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto& kv_tensor : tensors_sorted) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 ", allocating %" PRId64 " bytes\n", included_layers, max_nelements, 3*4*max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch(max_nelements);
+    std::vector<char> quantized_scratch(max_nelements*4);
+    std::vector<float> output_scratch(max_nelements);
+
+    // loop through quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (params.verbose) {
+                printf("testing %s ...\n", type_strs[i]);
+            }
+
+            error_stats global_stats {};
+
+            for (const auto& kv_tensor : tensors_sorted) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf(" %s ...\n", kv_tensor.first.c_str());
+                }
+                std::string layer_name { type_strs[i] };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        kv_tensor.second,
+                        input_scratch.data(),
+                        quantized_scratch.data(),
+                        output_scratch.data(),
+                        global_stats
+                );
+            }
+
+            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
diff --git a/ggml.c b/ggml.c
index 63aa5eb6e..3a28616ff 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2,6 +2,7 @@
 #define _GNU_SOURCE
 
 #include "ggml.h"
+#include "ggml_internal.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -6496,16 +6497,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
-typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
-typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
-
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t quantize_row_q;
-    vec_dot_q_t vec_dot_q;
-} quantize_fns_t;
-
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
         .dequantize_row_q = dequantize_row_q4_0,
@@ -6519,6 +6510,12 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     },
 };
 
+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+    GGML_ASSERT(i < GGML_TYPE_COUNT);
+    return quantize_fns[i];
+}
+
 static void ggml_compute_forward_mul_mat_q_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
diff --git a/ggml_internal.h b/ggml_internal.h
new file mode 100644
index 000000000..0761bad3e
--- /dev/null
+++ b/ggml_internal.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// Internal functions exposed for tests and benchmarks
+
+#include <stddef.h> // size_t
+
+#ifdef __cplusplus
+// restrict not standard in C++
+#define restrict
+extern "C" {
+#endif
+
+// Signatures of the per-type row conversion and dot product functions
+typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
+typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
+
+// One entry of the per-type function table
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t quantize_row_q;
+    vec_dot_q_t vec_dot_q;
+} quantize_fns_t;
+
+// Returns the table entry for ggml type index i (asserts i < GGML_TYPE_COUNT)
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llama.cpp b/llama.cpp
index 854bb8993..b736dd88a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,4 +1,5 @@
 #include "llama.h"
+#include "llama_internal.h"
 
 #include "ggml.h"
 
@@ -1854,3 +1855,8 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
+// For internal test use
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+    return ctx->model.tensors;
+}
diff --git a/llama_internal.h b/llama_internal.h
new file mode 100644
index 000000000..25c8c2c87
--- /dev/null
+++ b/llama_internal.h
@@ -0,0 +1,13 @@
+#ifndef LLAMA_INTERNAL_H
+#define LLAMA_INTERNAL_H
+
+// Internal functions exposed for tests and benchmarks
+
+#include "ggml.h"
+
+#include <string>
+#include <unordered_map>
+
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif