ggml-quants : use ceiling division when quantizing q1_3
This commit is contained in:
parent
9465ec6e12
commit
89dc3b254c
4 changed files with 12 additions and 7 deletions
|
@ -15,11 +15,13 @@
|
|||
|
||||
constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_BITNET = 0.015625f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
|
||||
constexpr float MAX_DOT_PRODUCT_ERROR_BITNET = 0.5f;
|
||||
|
||||
static const char* RESULT_STR[] = {"ok", "FAILED"};
|
||||
|
||||
|
@ -144,6 +146,8 @@ int main(int argc, char * argv[]) {
|
|||
if (qfns.from_float && qfns.to_float) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
const float max_quantization_error =
|
||||
type == GGML_TYPE_Q1_3 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET :
|
||||
type == GGML_TYPE_Q2_2 ? MAX_QUANTIZATION_TOTAL_ERROR_BITNET :
|
||||
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||
|
@ -166,6 +170,8 @@ int main(int argc, char * argv[]) {
|
|||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||
: type == GGML_TYPE_Q2_2 || type == GGML_TYPE_Q1_3
|
||||
? MAX_DOT_PRODUCT_ERROR_BITNET
|
||||
: MAX_DOT_PRODUCT_ERROR;
|
||||
failed = !(vec_dot_error < max_allowed_error);
|
||||
num_failed += failed;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue