diff --git a/Makefile b/Makefile index a1b99c6f9..e7470d51a 100644 --- a/Makefile +++ b/Makefile @@ -133,7 +133,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize perplexity embedding +default: main quantize quantize-stats perplexity embedding # # Build library diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 050300931..597943958 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -16,6 +16,9 @@ #include #include +static const char * type_strs[] = { "f32", "f16", "q4_0", "q4_1", "q8_0", "i8", "i16", "i32", }; +static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); + struct quantize_stats_params { std::string model = "models/7B/ggml-model-f16.bin"; bool verbose = false; diff --git a/ggml.c b/ggml.c index 3e8e80570..0a8aa828b 100644 --- a/ggml.c +++ b/ggml.c @@ -7151,14 +7151,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q8_0, + .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .quantize_row_q_dot = quantize_row_q8_0, .vec_dot_q = ggml_vec_dot_q4_0_q8_0, }, [GGML_TYPE_Q4_1] = { .dequantize_row_q = dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .quantize_row_q_dot = quantize_row_q8_0, .vec_dot_q = ggml_vec_dot_q4_1, }, // TODO: GGML_TYPE_Q8_0 @@ -7217,8 +7219,8 @@ static void ggml_compute_forward_mul_mat_q_f32( GGML_ASSERT(ne3 == ne13); const enum ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; + quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; + vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); @@ -7292,7 +7294,7 @@ static void ggml_compute_forward_mul_mat_q_f32( for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); wdata += row_size; } } diff --git a/ggml.h b/ggml.h index 8dc28a6a8..241e96a19 100644 --- a/ggml.h +++ b/ggml.h @@ -837,6 +837,7 @@ typedef struct { dequantize_row_q_t dequantize_row_q; quantize_row_q_t quantize_row_q; quantize_row_q_t quantize_row_q_reference; + quantize_row_q_t quantize_row_q_dot; vec_dot_q_t vec_dot_q; } quantize_fns_t;