better quantize_row_q8_K

Co-authored-by: camel-cdr <camel-cdr@protonmail.com>
This commit is contained in:
Xuan Son Nguyen 2025-02-02 12:04:55 +01:00
parent 9517aee23c
commit 10dacabbcd

View file

@@ -1660,33 +1660,22 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
const float * x_block = x + i * QK_K; const float * x_block = x + i * QK_K;
v128_t amax_vec = wasm_f32x4_splat(0.0f);
v128_t max_vec = wasm_f32x4_splat(0.0f);
// Vectorized max abs value search v128_t min_vec = wasm_v128_load(x_block);
for (int j = 0; j < QK_K; j += 4) { v128_t max_vec = min_vec;
for (int j = 4; j < QK_K; j += 4) {
v128_t x_vec = wasm_v128_load(x_block + j); v128_t x_vec = wasm_v128_load(x_block + j);
v128_t abs_x = wasm_f32x4_abs(x_vec); max_vec = wasm_f32x4_pmax(max_vec, x_vec);
v128_t mask = wasm_f32x4_gt(abs_x, amax_vec); min_vec = wasm_f32x4_pmin(min_vec, x_vec);
amax_vec = wasm_v128_bitselect(abs_x, amax_vec, mask);
max_vec = wasm_v128_bitselect(x_vec, max_vec, mask);
} }
max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1));
// Manual unroll for lane extraction max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2));
float amax = wasm_f32x4_extract_lane(amax_vec, 0); min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1));
float max_val = wasm_f32x4_extract_lane(max_vec, 0); min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2));
#define UPDATE_MAX(lane) \ float max = wasm_f32x4_extract_lane(max_vec, 0);
{ \ float min = wasm_f32x4_extract_lane(min_vec, 0);
float a = wasm_f32x4_extract_lane(amax_vec, lane); \ float amax = -min > max ? min : max;
if (a > amax) { \
amax = a; \
max_val = wasm_f32x4_extract_lane(max_vec, lane); \
} \
}
UPDATE_MAX(1)
UPDATE_MAX(2)
UPDATE_MAX(3)
#undef UPDATE_MAX
if (amax == 0.0f) { if (amax == 0.0f) {
yc[i].d = 0.0f; yc[i].d = 0.0f;