iq1s_blocks16: Adjust scale fudge factor to 1.125

This commit is contained in:
Iwan Kawrakow 2024-03-08 13:38:45 +02:00
parent c9e9acf2be
commit cd83a7d362

View file

@@ -11635,7 +11635,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
}
float d = max_scale/31;
y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
float id = 1/d;
for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ib += 2) {
int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));