diff --git a/ggml-quants.c b/ggml-quants.c index 9e62a3f32..0688eec5c 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1109,7 +1109,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * float ax = fabsf(x[i]); if (ax > amax) { amax = ax; max = x[i]; } } - if (amax < 1e-30f) { // all zero + if (amax < 1e-20f) { // all zero for (int i = 0; i < n; ++i) { L[i] = 0; } @@ -1177,7 +1177,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * float ax = fabsf(x[i]); if (ax > amax) { amax = ax; max = x[i]; } } - if (!amax) { // all zero + if (amax < 1e20f) { // all zero for (int i = 0; i < n; ++i) { L[i] = 0; } return 0.f; } @@ -2653,7 +2653,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } - if (!max_abs_scale) { + if (max_abs_scale < 1e-20f) { memset(&y[i], 0, sizeof(block_q6_K)); y[i].d = GGML_FP32_TO_FP16(0.f); x += QK_K; @@ -2805,7 +2805,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri } - if (!max_abs_scale) { + if (max_abs_scale < 1e-20f) { memset(&y[i], 0, sizeof(block_q6_K)); y[i].d = GGML_FP32_TO_FP16(0.f); x += QK_K; @@ -13213,7 +13213,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } float max = xval[0]; for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]); - if (!max) { + if (max < 1e-20f) { scales[ib] = 0; memset(L, 0, 32); continue; @@ -13941,7 +13941,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } float max = fabsf(xb[0]); for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); - if (!max) { + if (max < 1e-20f) { scales[ib] = 0; memset(L, 1, block_size); continue; @@ -14205,7 +14205,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block amax = ax; max = xb[j]; } } - if (!amax) { + if (amax < 1e-20f) { scales[ib] = 0; continue; } @@ -14426,7 +14426,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } float max = xval[0]; for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]); - if (!max) { + if (max < 1e-20f) { scales[ib] = 0; continue; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 85ef21c2a..94cfa9c93 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -49,6 +49,15 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } +#if 0 + // test quantization with very small values that may result in nan scales due to division by zero + if (ggml_is_quantized(tensor->type)) { + for (int i = 0; i < 256; i++) { + data[i] = 1e-24f; + } + } +#endif + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {