ggml : fix quantize_row_q8_0() ARM_NEON rounding

2023-04-14 21:27:55 +03:00 · 2023-04-14 21:27:55 +03:00 · 59fb9e9eb8
commit 59fb9e9eb8
parent 801aab14aa
1 changed file with 1 addition and 2 deletions
--- a/ggml.c
+++ b/ggml.c
@@ -1093,8 +1093,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int

        for (int l = 0; l < 8; l++) {
            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            //TODO: rounding
-            const int32x4_t   vi = vcvtq_s32_f32(v);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);

            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);