From 3b6199ba3cc7b050a1054ae1e6dfd8c1accbad17 Mon Sep 17 00:00:00 2001
From: junchao-loongson
Date: Sat, 18 May 2024 15:35:51 +0800
Subject: [PATCH] format code

---
 ggml-quants.c | 200 ++++++++++++++++++--------------------------------
 1 file changed, 73 insertions(+), 127 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 0126c5d96..e10315678 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -264,21 +264,19 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4

 #if defined(__loongarch_asx)

-typedef union
-{
+typedef union {
     int32_t i;
     float f;
-} FloatInt;
+} ft_union;
+
 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val)
-{
-    FloatInt fi_tmpval = {.f = val};
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
 }

-static __m256 __lasx_xvreplfr2vr_s(float val)
-{
-    FloatInt fi_tmpval = {.f = val};
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
 }

@@ -291,8 +289,7 @@ static __m256 __lasx_xvreplfr2vr_s(float val)
 #endif
 #define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
 // Convert __m128i to __m256i
-static inline __m256i ____m256i(__m128i in)
-{
+static inline __m256i ____m256i(__m128i in) {
     __m256i out = __lasx_xvldi(0);
     __asm__ volatile (
         ".irp i," __ALL_REGS "\n\t"
@@ -309,8 +306,7 @@ static inline __m256i ____m256i(__m128i in)
     return out;
 }
 // Convert two __m128i to __m256i
-static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo)
-{
+static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
     __m256i out;
     __asm__ volatile (
         ".irp i," __ALL_REGS "\n\t"
@@ -339,8 +335,7 @@ static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo)
     return out;
 }
 // Convert __m256i low part to __m128i
-static inline __m128i lasx_extracti128_lo(__m256i in)
-{
+static inline __m128i lasx_extracti128_lo(__m256i in) {
     __m128i out;
     __asm__ volatile (
         ".ifnc %[out], %[in] \n\t"
@@ -359,8 +354,7 @@ static inline __m128i lasx_extracti128_lo(__m256i in)
     return out;
 }
 // Convert __m256i high part to __m128i
-static inline __m128i lasx_extracti128_hi(__m256i in)
-{
+static inline __m128i lasx_extracti128_hi(__m256i in) {
     __m128i out;
     __asm__ volatile (
         ".irp i," __ALL_REGS "\n\t"
@@ -377,32 +371,28 @@
-static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
+static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) {
     v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7};
     return (__m256i)__ret;
 }

-static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d)
-{
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
     v4i32 __ret = {d, c, b, a};
     return (__m128i)__ret;
 }
-static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d)
-{
+
+static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
     v4i64 __ret = {d, c, b, a};
     return (__m256i)__ret;
 }

-static __m256i lasx_insertf128( __m128i x, __m128i y)
-{
+static __m256i lasx_insertf128( __m128i x, __m128i y) {
     return lasx_set_q(x, y);
 }

 #undef MM256_SET_M128I
 #define MM256_SET_M128I(a, b) lasx_insertf128((a), (b))

-static __m128i lsx_shuffle_b(__m128i a, __m128i b)
-{
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
     __m128i mask_f, zero, tmp0, tmp2, mask;
     int f = 0x8f;
     mask_f = __lsx_vreplgr2vr_b(f);
@@ -414,8 +404,7 @@ static __m128i lsx_shuffle_b(__m128i a, __m128i b)
     return __lsx_vshuf_b(a, zero, tmp2);
 }

-static __m256i lasx_shuffle_b(__m256i a, __m256i b)
-{
+static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
     __m256i mask_f, zero, tmp0, tmp2, mask;
     int f = 0x8f;
     mask_f = __lasx_xvreplgr2vr_b(f);
@@ -427,24 +416,21 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b)
     return __lasx_xvshuf_b(a, zero, tmp2);
 }

-static __m256i lasx_extu8_16(__m128i a)
-{
+static __m256i lasx_extu8_16(__m128i a) {
     __m128i zero = __lsx_vldi(0);
     __m128i vlo = __lsx_vilvl_b(zero, a);
     __m128i vhi = __lsx_vilvh_b(zero, a);
     return lasx_set_q(vhi, vlo);
 }

-static __m256i lasx_ext8_16(__m128i a)
-{
+static __m256i lasx_ext8_16(__m128i a) {
     __m128i sign = __lsx_vslti_b(a, 0);
     __m128i vlo = __lsx_vilvl_b(sign, a);
     __m128i vhi = __lsx_vilvh_b(sign, a);
     return lasx_set_q(vhi, vlo);
 }

-static __m256i lasx_ext16_32(__m128i a)
-{
+static __m256i lasx_ext16_32(__m128i a) {
     __m256i tmp1;
     tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
     tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
@@ -457,8 +443,7 @@ static __m256i lasx_ext16_32(__m128i a)
     return tmp1;
 }

-static __m128i lasx_extracti128( __m256i a, int pos)
-{
+static __m128i lasx_extracti128( __m256i a, int pos) {
     __m128i ret;
     if( pos == 0)
     {
@@ -469,8 +454,7 @@ static __m128i lasx_extracti128( __m256i a, int pos)
     return ret;
 }

-static __m128 lasx_extractf128( __m256 a, int pos)
-{
+static __m128 lasx_extractf128( __m256 a, int pos) {
     __m128 ret;
     if( pos == 0)
     {
@@ -481,78 +465,68 @@ static __m128 lasx_extractf128( __m256 a, int pos)
     return ret;
 }

-static __m128i lsx_hadd_h(__m128i a, __m128i b)
-{
+static __m128i lsx_hadd_h(__m128i a, __m128i b) {
     __m128i tmp1 = __lsx_vpickev_h(b, a);
     __m128i tmp2 = __lsx_vpickod_h(b, a);
     return __lsx_vadd_h(tmp1, tmp2);
 }

-static __m128i lsx_hadd_w(__m128i a, __m128i b)
-{
+static __m128i lsx_hadd_w(__m128i a, __m128i b) {
     __m128i tmp1 = __lsx_vpickev_w(b, a);
     __m128i tmp2 = __lsx_vpickod_w(b, a);
     return __lsx_vadd_w(tmp1, tmp2);
 }

-static __m128 lsx_hadd_s(__m128 a, __m128 b)
-{
+static __m128 lsx_hadd_s(__m128 a, __m128 b) {
     __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
     __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
     return __lsx_vfadd_s(tmp1, tmp2);
 }

-static __m256i lasx_maddubs_h(__m256i a, __m256i b)
-{
+static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
     __m256i tmp1, tmp2;
     tmp1 = __lasx_xvmulwev_h_b(a, b);
     tmp2 = __lasx_xvmulwod_h_b(a, b);
     return __lasx_xvsadd_h(tmp1, tmp2);
 }

-static __m256i lasx_madd_h(__m256i a, __m256i b)
-{
+static __m256i lasx_madd_h(__m256i a, __m256i b) {
     __m256i tmp1, tmp2;
     tmp1 = __lasx_xvmulwev_w_h(a, b);
     tmp2 = __lasx_xvmulwod_w_h(a, b);
     return __lasx_xvadd_w(tmp1, tmp2);
 }

-static __m256i lasx_packs_w(__m256i a, __m256i b)
-{
+static __m256i lasx_packs_w(__m256i a, __m256i b) {
     __m256i tmp, tmp1;
     tmp = __lasx_xvsat_w(a, 15);
     tmp1 = __lasx_xvsat_w(b, 15);
     return __lasx_xvpickev_h(tmp1, tmp);
 }

-static __m256i lasx_packs_h(__m256i a, __m256i b)
-{
+static __m256i lasx_packs_h(__m256i a, __m256i b) {
     __m256i tmp, tmp1;
     tmp = __lasx_xvsat_h(a, 7);
     tmp1 = __lasx_xvsat_h(b, 7);
     return __lasx_xvpickev_b(tmp1, tmp);
 }

-static __m128i lsx_packs_w(__m128i a, __m128i b)
-{
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
     __m128i tmp, tmp1;
     tmp = __lsx_vsat_w(a, 15);
     tmp1 = __lsx_vsat_w(b, 15);
     return __lsx_vpickev_h(tmp1, tmp);
 }

-static __m128i lsx_packs_h(__m128i a, __m128i b)
-{
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
     __m128i tmp, tmp1;
     tmp = __lsx_vsat_h(a, 7);
     tmp1 = __lsx_vsat_h(b, 7);
     return __lsx_vpickev_b(tmp1, tmp);
 }

-static __m128i lsx_packus_h(__m128i a, __m128i b)
-{
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
     __m128i tmp, tmp1;
     tmp = __lsx_vsat_hu(a, 7);
     tmp1 = __lsx_vsat_hu(b, 7);
@@ -560,16 +534,14 @@ static __m128i lsx_packus_h(__m128i a, __m128i b)
 }

-static __m128i lsx_maddubs_h(__m128i a, __m128i b)
-{
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
     __m128i tmp1, tmp2;
     tmp1 = __lsx_vmulwev_h_b(a, b);
     tmp2 = __lsx_vmulwod_h_b(a, b);
     return __lsx_vsadd_h(tmp1, tmp2);
 }

-static __m128i lsx_madd_h(__m128i a, __m128i b)
-{
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
     __m128i tmp1, tmp2;
     tmp1 = __lsx_vmulwev_w_h(a, b);
     tmp2 = __lsx_vmulwod_w_h(a, b);
@@ -591,7 +563,7 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);
-    FloatInt tmp;
+    ft_union tmp;
     res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
@@ -651,8 +623,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {

 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
     const __m128i lo = __lsx_vld((const __m128i *)rsi, 0);
     __m128i hi = __lsx_vsrli_h(lo, 4);
     return __lasx_xvandi_b(MM256_SET_M128I(hi, lo), 0xf);
@@ -682,8 +653,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     return mul_sum_us8_pairs_float(ax, sy);
 }

-static inline __m128i packNibbles( __m256i bytes )
-{
+static inline __m128i packNibbles( __m256i bytes ) {
     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
     const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF);
     __m256i high = __lasx_xvandn_v(lowByte, bytes);
@@ -1129,7 +1099,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 #elif defined(__loongarch_asx)

     for (int i = 0; i < nb; i++) {
-        FloatInt fi;
+        ft_union fi;
         __m256 v0 = (__m256)__lasx_xvld( x , 0);
         __m256 v1 = (__m256)__lasx_xvld( x , 32);
         __m256 v2 = (__m256)__lasx_xvld( x , 64);
@@ -1137,23 +1107,23 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         x += 32;

         // Compute max(abs(e)) for the block
-        const __m256 signBit = __lasx_xvreplfr2vr_s( -0.0f );
-        __m256 maxAbs = (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v0 );
-        maxAbs = __lasx_xvfmax_s( maxAbs, (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v1 ) );
-        maxAbs = __lasx_xvfmax_s( maxAbs, (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v2 ) );
-        maxAbs = __lasx_xvfmax_s( maxAbs, (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v3 ) );
+        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );

-        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( maxAbs, 1 ), lasx_extractf128( maxAbs , 0) );
+        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) );
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
         fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float maxScalar = fi.f;
+        const float max_scalar = fi.f;

         // Quantize these floats
-        const float d = maxScalar / 127.f;
+        const float d = max_scalar / 127.f;
         y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
         const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );

         // Apply the multiplier
@@ -1337,12 +1307,12 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
         max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
         max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
+        const float max_scalar = _mm_cvtss_f32( max4 );

         // Quantize these floats
-        const float d = maxScalar / 127.f;
+        const float d = max_scalar / 127.f;
         y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
         const __m256 mul = _mm256_set1_ps( id );

         // Apply the multiplier
@@ -1488,7 +1458,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 #elif defined(__loongarch_asx)

     for (int i = 0; i < nb; i++) {
-        FloatInt ft;
+        ft_union ft;
         __m256 v0 = (__m256)__lasx_xvld( x , 0 );
         __m256 v1 = (__m256)__lasx_xvld( x , 32 );
         __m256 v2 = (__m256)__lasx_xvld( x , 64 );
@@ -1496,23 +1466,23 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         x += 32;

         // Compute max(abs(e)) for the block
-        const __m256 signBit = __lasx_xvreplfr2vr_s( -0.0f );
-        __m256 maxAbs = (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v0 );
-        maxAbs = __lasx_xvfmax_s( maxAbs, (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v1 ) );
-        maxAbs = __lasx_xvfmax_s( maxAbs, (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v2 ) );
-        maxAbs = __lasx_xvfmax_s( maxAbs, (__m256)__lasx_xvandn_v( (__m256i)signBit, (__m256i)v3 ) );
+        const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
+        __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
+        max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );

-        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( maxAbs, 1 ), lasx_extractf128( maxAbs, 0) );
+        __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
         ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float maxScalar = ft.f;
+        const float max_scalar = ft.f;

         // Quantize these floats
-        const float d = maxScalar / 127.f;
+        const float d = max_scalar / 127.f;
         y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
         const __m256 mul = __lasx_xvreplfr2vr_s( id );

         // Apply the multiplier
@@ -4501,7 +4471,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     *s = hsum_float_8(acc);
 #elif defined(__loongarch_sx)
     // set constants
-    const __m128i lowMask = __lsx_vreplgr2vr_b(0xF);
+    const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
     const __m128i off = __lsx_vreplgr2vr_b(8);

     // Initialize accumulator with zeros
@@ -4520,30 +4490,27 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r

     const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[0].qs, 0);

-    __m128i bx_0 = __lsx_vand_v(lowMask, tmp_0_1);
+    __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
     __m128i by_0 = __lsx_vld((const __m128i *)y[0].qs, 0);
     bx_0 = __lsx_vsub_b(bx_0, off);
     const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

-    __m128i bx_1 = __lsx_vand_v(lowMask, __lsx_vsrli_d(tmp_0_1, 4));
+    __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
     __m128i by_1 = __lsx_vld((const __m128i *)(y[0].qs + 16), 0);
     bx_1 = __lsx_vsub_b(bx_1, off);
     const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

-    //_mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
-    //_mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
-
     // Compute combined scale for the block 2 and 3
     const __m128 d_2_3 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );

     const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[1].qs, 0);

-    __m128i bx_2 = __lsx_vand_v(lowMask, tmp_2_3);
+    __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
     __m128i by_2 = __lsx_vld((const __m128i *)y[1].qs, 0);
     bx_2 = __lsx_vsub_b(bx_2, off);
     const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);

-    __m128i bx_3 = __lsx_vand_v(lowMask, __lsx_vsrli_d(tmp_2_3, 4));
+    __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
     __m128i by_3 = __lsx_vld((const __m128i *)(y[1].qs + 16), 0);
     bx_3 = __lsx_vsub_b(bx_3, off);
     const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
@@ -4565,20 +4532,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r

     // Main loop
     for (int i = 2; i < nb; i+=2) {
-        //_mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
-        //_mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);

         // Compute combined scale for the block 0 and 1
         const __m128 d_0_1 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[i].qs, 0);

-        __m128i bx_0 = __lsx_vand_v(lowMask, tmp_0_1);
+        __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
         __m128i by_0 = __lsx_vld((const __m128i *)y[i].qs, 0);
         bx_0 = __lsx_vsub_b(bx_0, off);
         const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

-        __m128i bx_1 = __lsx_vand_v(lowMask, __lsx_vsrli_d(tmp_0_1, 4));
+        __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
         __m128i by_1 = __lsx_vld((const __m128i *)(y[i].qs + 16), 0);
         bx_1 = __lsx_vsub_b(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
@@ -4591,12 +4556,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r

         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[i + 1].qs, 0);

-        __m128i bx_2 = __lsx_vand_v(lowMask, tmp_2_3);
+        __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
         __m128i by_2 = __lsx_vld((const __m128i *)y[i + 1].qs, 0);
         bx_2 = __lsx_vsub_b(bx_2, off);
         const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);

-        __m128i bx_3 = __lsx_vand_v(lowMask, __lsx_vsrli_d(tmp_2_3, 4));
+        __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
         __m128i by_3 = __lsx_vld((const __m128i *)(y[i + 1].qs + 16), 0);
         bx_3 = __lsx_vsub_b(bx_3, off);
         const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
@@ -4896,7 +4861,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);

-        const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); //FIXME
+        const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
         const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );

         // Compute combined scales
@@ -6839,7 +6804,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const __m256i p_2 = lasx_ext16_32(lasx_extracti128(p1, 0));
         const __m256i p_3 = lasx_ext16_32(lasx_extracti128(p1, 1));

-        FloatInt t0, t1, t2, t3;
+        ft_union t0, t1, t2, t3;
         t0.f = d * db[0];
         t1.f = d * db[1];
         t2.f = d * db[2];
@@ -7595,12 +7560,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             p16_0 = __lasx_xvadd_w(p16_0, p16_1);
             p16_2 = __lasx_xvadd_w(p16_2, p16_3);
             sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
-
         }
-
         // multiply with block scale and accumulate
         acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);//FIXME
-
     }

     *s = hsum_float_8(acc);
@@ -8123,7 +8085,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

         // multiply with block scale and accumulate
         acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(p16_0), acc);
-
     }

     *s = hsum_float_8(acc);
@@ -8668,21 +8629,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

         __m256 vd = __lasx_xvreplfr2vr_s(d);
         acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
-
     }

     acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
     __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
     acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
-
-    FloatInt fi;
+    ft_union fi;
     fi.i = __lsx_vpickve2gr_w(acc_m, 0);

     *s = hsum_float_8(acc) + fi.f ;

 #else
-
     const uint8_t * scales = (const uint8_t*)&utmp[0];
     const uint8_t * mins = (const uint8_t*)&utmp[2];
@@ -9068,7 +9026,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const __m256i p32h = lasx_madd_h(__lasx_xvreplgr2vr_h(scales[1]), p16h);
         acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(p32h), acc);
-
     }

     *s = hsum_float_8(acc) - summs;
@@ -9668,12 +9625,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             p16_1 = lasx_madd_h(scale_1, p16_1);
             sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
-
         }

         __m256 vd = __lasx_xvreplfr2vr_s(d);
         acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
-
     }

     *s = hsum_float_8(acc) + summs;
@@ -10094,7 +10049,6 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const __m256i dot = __lasx_xvsub_w(__lasx_xvadd_w(p16_0, p16_1), __lasx_xvadd_w(s16_0, s16_1));
         acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(dot), acc);
-
     }

     *s = hsum_float_8(acc);
@@ -10745,7 +10699,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
             sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
-
         }

         acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
@@ -11507,7 +11460,6 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
         }

         accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-
     }

     *s = 0.125f * hsum_float_8(accumf);
@@ -11857,7 +11809,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         const __m256i sum = __lasx_xvadd_w(lasx_madd_h(sc1, dot1), lasx_madd_h(sc2, dot2));

         accumf = __lasx_vfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sum), accumf);
-
     }

     *s = 0.125f * hsum_float_8(accumf);
@@ -12450,7 +12401,6 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
         }

         accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-
     }

     *s = 0.125f * hsum_float_8(accumf);
@@ -12738,7 +12688,6 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
         }

         accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-
     }

     *s = 0.25f * hsum_float_8(accumf);
@@ -13167,7 +13116,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
         }

         accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
-
     }

     *s = hsum_float_8(accumf);
@@ -13462,7 +13410,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
         q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2);
         q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3);
-
         __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0);
         q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1);
         q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2);
@@ -13496,7 +13443,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
         accum1 += d * sumi1;
-
     }

     *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
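
Note (not part of the patch): the renamed variables max_abs, max_scalar, d, and id in quantize_row_q8_0 implement the usual per-block Q8_0 scaling, d = max_scalar / 127 and id = 127 / max_scalar. A plain scalar sketch of that math is below for reference; the helper name quantize_block_q8_0_ref and QK8_0 == 32 are assumptions for illustration (the real code stores d as FP16 and uses round-to-nearest-even vector rounding).

    // Scalar sketch of the per-block Q8_0 quantization math referenced above.
    #include <math.h>
    #include <stdint.h>

    #define QK8_0 32  // block size assumed to match the 32-float blocks in the vector code

    static void quantize_block_q8_0_ref(const float * x, int8_t * qs, float * d_out) {
        // max_abs: largest magnitude in the block (the vector code clears the sign bit and reduces)
        float max_abs = 0.0f;
        for (int j = 0; j < QK8_0; j++) {
            const float v = fabsf(x[j]);
            if (v > max_abs) max_abs = v;
        }
        const float d  = max_abs / 127.0f;                          // block scale, stored per block
        const float id = (max_abs != 0.0f) ? 127.0f / max_abs : 0.0f; // inverse scale, avoids div by zero
        for (int j = 0; j < QK8_0; j++) {
            qs[j] = (int8_t) roundf(x[j] * id);                     // quantized values in [-127, 127]
        }
        *d_out = d;
    }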