diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6ec7ce67d..924058fd8 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -24,6 +24,7 @@ struct quantize_stats_params { bool verbose = false; bool per_layer_stats = false; bool print_histogram = false; + bool reference = false; std::vector include_layers; std::vector exclude_layers; std::vector include_types; @@ -49,6 +50,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) { fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -r, --reference\n"); + fprintf(stderr, " use reference implementation (default: false)\n"); fprintf(stderr, " -v, --verbose\n"); fprintf(stderr, " verbose output (default: false)\n"); fprintf(stderr, " -p, --per-layer-stats\n"); @@ -135,6 +138,7 @@ void test_roundtrip_on_layer( std::string & name, bool print_layer_stats, const quantize_fns_t & qfns, + bool use_reference, const ggml_tensor * layer, float * input_scratch, char *quantized_scratch, @@ -156,7 +160,11 @@ void test_roundtrip_on_layer( input_scratch = ggml_get_data_f32(layer) + offset; } - qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); + if (use_reference) { + qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); + } else { + qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); + } qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); update_error_stats(chunk_size, input_scratch, output_scratch, total_error); @@ -184,6 +192,8 @@ int main(int argc, char ** argv) { if (arg == "-h" || arg == "--help") { quantize_stats_print_usage(argc, argv); exit(0); + } else if (arg == "-r" || arg == "--reference") { + params.reference = true; } else if (arg == "-v") { params.verbose = true; } else if (arg == "-p" || arg == "--per-layer-stats") { @@ -320,6 +330,7 @@ int main(int argc, char ** argv) { layer_name, params.per_layer_stats, qfns, + params.reference, kv_tensor.second, input_scratch.data(), quantized_scratch.data(), diff --git a/ggml.c b/ggml.c index 3a28616ff..de986e591 100644 --- a/ggml.c +++ b/ggml.c @@ -6499,14 +6499,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .vec_dot_q = ggml_vec_dot_q4_0, + .dequantize_row_q = dequantize_row_q4_0, + .quantize_row_q = quantize_row_q4_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .vec_dot_q = ggml_vec_dot_q4_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .vec_dot_q = ggml_vec_dot_q4_1, + .dequantize_row_q = dequantize_row_q4_1, + .quantize_row_q = quantize_row_q4_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .vec_dot_q = ggml_vec_dot_q4_1, }, }; diff --git a/ggml_internal.h b/ggml_internal.h index 0761bad3e..6bfa441d5 100644 --- a/ggml_internal.h +++ b/ggml_internal.h @@ -15,6 +15,7 @@ typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restri typedef struct { dequantize_row_q_t dequantize_row_q; quantize_row_q_t quantize_row_q; + quantize_row_q_t quantize_row_q_reference; vec_dot_q_t vec_dot_q; } quantize_fns_t;