/* clang-format off */
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED
#include "third_party/intel/mm_malloc.internal.h"
#include "third_party/intel/mmintrin.internal.h"

enum _mm_hint {
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

#ifdef __OPTIMIZE__
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_prefetch(const void *__P, enum _mm_hint __I) {
  __builtin_prefetch(__P, (__I & 0x4) >> 2, __I & 0x3);
}
#else
#define _mm_prefetch(P, I) __builtin_prefetch((P), ((I & 0x4) >> 2), (I & 0x3))
#endif
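
/*
 * Usage sketch (illustrative, not part of the upstream header): warm the
 * cache ahead of a hot loop. The hint selects the target cache level; the
 * bit twiddling above maps bit 2 of the hint to the builtin's "write"
 * argument and the low two bits to its locality argument.
 *
 *   const char *p = buf;
 *   _mm_prefetch(p + 256, _MM_HINT_T0);   // fetch into all cache levels
 *   _mm_prefetch(p + 512, _MM_HINT_NTA);  // non-temporal, minimize pollution
 */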

#ifndef __SSE__
#pragma GCC push_options
#pragma GCC target("sse")
#define __DISABLE_SSE__
#endif

typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef float __m128_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef float __v4sf __attribute__((__vector_size__(16)));

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
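
/*
 * Worked example (illustrative, not part of the upstream header):
 * _MM_SHUFFLE packs four 2-bit lane selectors into one immediate,
 * highest lane first.
 *
 *   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4   // identity permutation
 *   _MM_SHUFFLE(0, 1, 2, 3) == 0x1B   // reverse the four lanes
 */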

#define _MM_EXCEPT_MASK 0x003f
#define _MM_EXCEPT_INVALID 0x0001
#define _MM_EXCEPT_DENORM 0x0002
#define _MM_EXCEPT_DIV_ZERO 0x0004
#define _MM_EXCEPT_OVERFLOW 0x0008
#define _MM_EXCEPT_UNDERFLOW 0x0010
#define _MM_EXCEPT_INEXACT 0x0020
#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_INVALID 0x0080
#define _MM_MASK_DENORM 0x0100
#define _MM_MASK_DIV_ZERO 0x0200
#define _MM_MASK_OVERFLOW 0x0400
#define _MM_MASK_UNDERFLOW 0x0800
#define _MM_MASK_INEXACT 0x1000
#define _MM_ROUND_MASK 0x6000
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_ps(void) {
  /* self-initialization yields an intentionally indeterminate value
     while silencing -Wuninitialized */
  __m128 __Y = __Y;
  return __Y;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_ps(void) {
  return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_addss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_subss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_mulss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_divss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_ss(__m128 __A) {
  return (__m128)__builtin_ia32_sqrtss((__v4sf)__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rcp_ss(__m128 __A) {
  return (__m128)__builtin_ia32_rcpss((__v4sf)__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rsqrt_ss(__m128 __A) {
  return (__m128)__builtin_ia32_rsqrtss((__v4sf)__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_minss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_maxss((__v4sf)__A, (__v4sf)__B);
}
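
/*
 * Semantics sketch (illustrative): the _ss forms operate on lane 0 only
 * and pass lanes 1-3 of the first operand through unchanged.
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes: 1,2,3,4
 *   __m128 b = _mm_set_ss(10.0f);
 *   __m128 r = _mm_add_ss(a, b);                    // lanes: 11,2,3,4
 */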

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_ps(__m128 __A) {
  return (__m128)__builtin_ia32_sqrtps((__v4sf)__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rcp_ps(__m128 __A) {
  return (__m128)__builtin_ia32_rcpps((__v4sf)__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rsqrt_ps(__m128 __A) {
  return (__m128)__builtin_ia32_rsqrtps((__v4sf)__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_minps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_maxps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_ps(__m128 __A, __m128 __B) {
  return __builtin_ia32_andps(__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_ps(__m128 __A, __m128 __B) {
  return __builtin_ia32_andnps(__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_ps(__m128 __A, __m128 __B) {
  return __builtin_ia32_orps(__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_ps(__m128 __A, __m128 __B) {
  return __builtin_ia32_xorps(__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpltss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpless((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_movss(
      (__v4sf)__A, (__v4sf)__builtin_ia32_cmpltss((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_movss(
      (__v4sf)__A, (__v4sf)__builtin_ia32_cmpless((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpnless((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_movss(
      (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_movss(
      (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnless((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpordss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpgtps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpgeps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpngtps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpngeps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpordps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_comieq((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_comilt((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_comile((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_comigt((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_comige((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_comineq((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_ucomieq((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_ucomilt((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_ucomile((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_ucomigt((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_ucomige((__v4sf)__A, (__v4sf)__B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_ss(__m128 __A, __m128 __B) {
  return __builtin_ia32_ucomineq((__v4sf)__A, (__v4sf)__B);
}
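
/*
 * Sketch (illustrative): these comparisons return int rather than a mask.
 * The comi* and ucomi* forms differ only in exception signaling: COMISS
 * raises the invalid-operation exception on QNaN operands, while UCOMISS
 * stays quiet.
 *
 *   int eq = _mm_comieq_ss(_mm_set_ss(1.0f), _mm_set_ss(1.0f));  // 1
 *   int lt = _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f));  // 1
 */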

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si32(__m128 __A) {
  return __builtin_ia32_cvtss2si((__v4sf)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_ss2si(__m128 __A) {
  return _mm_cvtss_si32(__A);
}

#ifdef __x86_64__
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si64(__m128 __A) {
  return __builtin_ia32_cvtss2si64((__v4sf)__A);
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si64x(__m128 __A) {
  return __builtin_ia32_cvtss2si64((__v4sf)__A);
}
#endif

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi32(__m128 __A) {
  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__A);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_ps2pi(__m128 __A) {
  return _mm_cvtps_pi32(__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si32(__m128 __A) {
  return __builtin_ia32_cvttss2si((__v4sf)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtt_ss2si(__m128 __A) {
  return _mm_cvttss_si32(__A);
}
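
/*
 * Sketch (illustrative): _mm_cvtss_si32 rounds according to MXCSR
 * (round-to-nearest-even by default), while _mm_cvttss_si32 always
 * truncates toward zero.
 *
 *   _mm_cvtss_si32(_mm_set_ss(2.5f));   // 2 (nearest-even)
 *   _mm_cvttss_si32(_mm_set_ss(2.9f));  // 2 (truncation)
 */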

#ifdef __x86_64__
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64(__m128 __A) {
  return __builtin_ia32_cvttss2si64((__v4sf)__A);
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64x(__m128 __A) {
  return __builtin_ia32_cvttss2si64((__v4sf)__A);
}
#endif

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_pi32(__m128 __A) {
  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__A);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtt_ps2pi(__m128 __A) {
  return _mm_cvttps_pi32(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_ss(__m128 __A, int __B) {
  return (__m128)__builtin_ia32_cvtsi2ss((__v4sf)__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_si2ss(__m128 __A, int __B) {
  return _mm_cvtsi32_ss(__A, __B);
}

#ifdef __x86_64__
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_ss(__m128 __A, long long __B) {
  return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_ss(__m128 __A, long long __B) {
  return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B);
}
#endif

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
  return (__m128)__builtin_ia32_cvtpi2ps((__v4sf)__A, (__v2si)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
  return _mm_cvtpi32_ps(__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi16_ps(__m64 __A) {
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;
  __sign = __builtin_ia32_pcmpgtw((__v4hi)0LL, (__v4hi)__A);
  __losi = (__v2si)__builtin_ia32_punpcklwd((__v4hi)__A, __sign);
  __hisi = (__v2si)__builtin_ia32_punpckhwd((__v4hi)__A, __sign);
  __zero = (__v4sf)_mm_setzero_ps();
  __ra = __builtin_ia32_cvtpi2ps(__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps(__ra, __hisi);
  return (__m128)__builtin_ia32_movlhps(__ra, __rb);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpu16_ps(__m64 __A) {
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;
  __losi = (__v2si)__builtin_ia32_punpcklwd((__v4hi)__A, (__v4hi)0LL);
  __hisi = (__v2si)__builtin_ia32_punpckhwd((__v4hi)__A, (__v4hi)0LL);
  __zero = (__v4sf)_mm_setzero_ps();
  __ra = __builtin_ia32_cvtpi2ps(__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps(__ra, __hisi);
  return (__m128)__builtin_ia32_movlhps(__ra, __rb);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi8_ps(__m64 __A) {
  __v8qi __sign;
  __sign = __builtin_ia32_pcmpgtb((__v8qi)0LL, (__v8qi)__A);
  __A = (__m64)__builtin_ia32_punpcklbw((__v8qi)__A, __sign);
  return _mm_cvtpi16_ps(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpu8_ps(__m64 __A) {
  __A = (__m64)__builtin_ia32_punpcklbw((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
  __v4sf __zero = (__v4sf)_mm_setzero_ps();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps(__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps(__sfa, (__v2si)__B);
  return (__m128)__builtin_ia32_movlhps(__sfa, __sfb);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi16(__m128 __A) {
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps(__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi(__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi(__losf);
  return (__m64)__builtin_ia32_packssdw(__hisi, __losi);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi8(__m128 __A) {
  __v4hi __tmp = (__v4hi)_mm_cvtps_pi16(__A);
  return (__m64)__builtin_ia32_packsswb(__tmp, (__v4hi)0LL);
}

#ifdef __OPTIMIZE__
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
  return (__m128)__builtin_ia32_shufps((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)                                         \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
                                 (int)(MASK)))
#endif
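
/*
 * Usage sketch (illustrative): _mm_shuffle_ps draws two lanes from each
 * operand; the low two selectors index into A, the high two into B.
 *
 *   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
 *   __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *   // r = {1, 2, 7, 8}
 */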

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_unpckhps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_unpcklps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pi(__m128 __A, __m64 const *__P) {
  return (__m128)__builtin_ia32_loadhps((__v4sf)__A, (const __v2sf *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pi(__m64 *__P, __m128 __A) {
  __builtin_ia32_storehps((__v2sf *)__P, (__v4sf)__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movehl_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_movhlps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movelh_ps(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_movlhps((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pi(__m128 __A, __m64 const *__P) {
  return (__m128)__builtin_ia32_loadlps((__v4sf)__A, (const __v2sf *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pi(__m64 *__P, __m128 __A) {
  __builtin_ia32_storelps((__v2sf *)__P, (__v4sf)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_ps(__m128 __A) {
  return __builtin_ia32_movmskps((__v4sf)__A);
}
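
/*
 * Sketch (illustrative): a branchless test of whether any lane satisfies
 * a predicate, by compressing the compare mask's sign bits to four bits.
 *
 *   __m128 x = _mm_setr_ps(-1.0f, 2.0f, 3.0f, 4.0f);
 *   int mask = _mm_movemask_ps(_mm_cmplt_ps(x, _mm_setzero_ps()));
 *   // mask == 0x1: only lane 0 is negative
 */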

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_getcsr(void) {
  return __builtin_ia32_stmxcsr();
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_GET_EXCEPTION_STATE(void) {
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_GET_EXCEPTION_MASK(void) {
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_GET_ROUNDING_MODE(void) {
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_GET_FLUSH_ZERO_MODE(void) {
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setcsr(unsigned int __I) {
  __builtin_ia32_ldmxcsr(__I);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_SET_EXCEPTION_STATE(unsigned int __mask) {
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_SET_EXCEPTION_MASK(unsigned int __mask) {
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_SET_ROUNDING_MODE(unsigned int __mode) {
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _MM_SET_FLUSH_ZERO_MODE(unsigned int __mode) {
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
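
/*
 * Usage sketch (illustrative): flush denormals to zero and round toward
 * zero for a precision-insensitive hot path, restoring MXCSR afterwards.
 *
 *   unsigned int saved = _mm_getcsr();
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   // ... hot loop ...
 *   _mm_setcsr(saved);
 */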

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ss(float __F) {
  return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_ps(float __F) {
  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ps1(float __F) {
  return _mm_set1_ps(__F);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ss(float const *__P) {
  return _mm_set_ss(*__P);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_ps(float const *__P) {
  return _mm_set1_ps(*__P);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ps1(float const *__P) {
  return _mm_load1_ps(__P);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ps(float const *__P) {
  return *(__m128 *)__P;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_ps(float const *__P) {
  return *(__m128_u *)__P;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_ps(float const *__P) {
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128)__builtin_ia32_shufps(__tmp, __tmp, _MM_SHUFFLE(0, 1, 2, 3));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ps(const float __Z, const float __Y, const float __X,
               const float __W) {
  return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
  return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
}
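
/*
 * Ordering note with a sketch (illustrative): _mm_set_ps takes its
 * arguments highest lane first, _mm_setr_ps lowest lane first; the two
 * calls below build the same vector.
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
 */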

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ss(float *__P, __m128 __A) {
  *__P = ((__v4sf)__A)[0];
}

extern __inline float
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_f32(__m128 __A) {
  return ((__v4sf)__A)[0];
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ps(float *__P, __m128 __A) {
  *(__m128 *)__P = __A;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_ps(float *__P, __m128 __A) {
  *(__m128_u *)__P = __A;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_ps(float *__P, __m128 __A) {
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 0, 0, 0));
  _mm_storeu_ps(__P, __tmp);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ps1(float *__P, __m128 __A) {
  _mm_store1_ps(__P, __A);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_ps(float *__P, __m128 __A) {
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 1, 2, 3));
  _mm_store_ps(__P, __tmp);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_shuffle(
      (__v4sf)__A, (__v4sf)__B,
      __extension__(__attribute__((__vector_size__(16))) int){4, 1, 2, 3});
}

#ifdef __OPTIMIZE__
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_pi16(__m64 const __A, int const __N) {
  return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__A, __N);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pextrw(__m64 const __A, int const __N) {
  return _mm_extract_pi16(__A, __N);
}
#else
#define _mm_extract_pi16(A, N)                                          \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(A), \
                                                    (int)(N)))
#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

#ifdef __OPTIMIZE__
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
  return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__A, __D, __N);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pinsrw(__m64 const __A, int const __D, int const __N) {
  return _mm_insert_pi16(__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(A), (int)(D), (int)(N)))
#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pi16(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaxsw(__m64 __A, __m64 __B) {
  return _mm_max_pi16(__A, __B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pu8(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaxub(__m64 __A, __m64 __B) {
  return _mm_max_pu8(__A, __B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pi16(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_pminsw((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminsw(__m64 __A, __m64 __B) {
  return _mm_min_pi16(__A, __B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pu8(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_pminub((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminub(__m64 __A, __m64 __B) {
  return _mm_min_pu8(__A, __B);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pi8(__m64 __A) {
  return __builtin_ia32_pmovmskb((__v8qi)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmovmskb(__m64 __A) {
  return _mm_movemask_pi8(__A);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pu16(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhuw(__m64 __A, __m64 __B) {
  return _mm_mulhi_pu16(__A, __B);
}

#ifdef __OPTIMIZE__
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi16(__m64 __A, int const __N) {
  return (__m64)__builtin_ia32_pshufw((__v4hi)__A, __N);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pshufw(__m64 __A, int const __N) {
  return _mm_shuffle_pi16(__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(A), (int)(N)))
#define _m_pshufw(A, N) _mm_shuffle_pi16(A, N)
#endif

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
#ifdef __MMX_WITH_SSE__
  /* Emulate MMX maskmovq with SSE2 maskmovdqu: if __P is misaligned, lower
     __P by the misalignment (capped at 8 bytes) and shift the data and mask
     up by the same amount, so the masked store touches the original bytes.
     Note __builtin_ia32_pslldqi128 counts its shift in bits, 8 per byte. */
  typedef long long __v2di __attribute__((__vector_size__(16)));
  typedef char __v16qi __attribute__((__vector_size__(16)));
  __v2di __A128 = __extension__(__v2di){((__v1di)__A)[0], 0};
  __v2di __N128 = __extension__(__v2di){((__v1di)__N)[0], 0};
  __SIZE_TYPE__ offset = ((__SIZE_TYPE__)__P) & 0xf;
  if (offset) {
    if (offset > 8) offset = 8;
    __P = (char *)(((__SIZE_TYPE__)__P) - offset);
    switch (offset) {
      case 1:
        __A128 = __builtin_ia32_pslldqi128(__A128, 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 8);
        break;
      case 2:
        __A128 = __builtin_ia32_pslldqi128(__A128, 2 * 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 2 * 8);
        break;
      case 3:
        __A128 = __builtin_ia32_pslldqi128(__A128, 3 * 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 3 * 8);
        break;
      case 4:
        __A128 = __builtin_ia32_pslldqi128(__A128, 4 * 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 4 * 8);
        break;
      case 5:
        __A128 = __builtin_ia32_pslldqi128(__A128, 5 * 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 5 * 8);
        break;
      case 6:
        __A128 = __builtin_ia32_pslldqi128(__A128, 6 * 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 6 * 8);
        break;
      case 7:
        __A128 = __builtin_ia32_pslldqi128(__A128, 7 * 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 7 * 8);
        break;
      case 8:
        __A128 = __builtin_ia32_pslldqi128(__A128, 8 * 8);
        __N128 = __builtin_ia32_pslldqi128(__N128, 8 * 8);
        break;
      default:
        break;
    }
  }
  __builtin_ia32_maskmovdqu((__v16qi)__A128, (__v16qi)__N128, __P);
#else
  __builtin_ia32_maskmovq((__v8qi)__A, (__v8qi)__N, __P);
#endif
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
  _mm_maskmove_si64(__A, __N, __P);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu8(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_pavgb((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgb(__m64 __A, __m64 __B) {
  return _mm_avg_pu8(__A, __B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu16(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_pavgw((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgw(__m64 __A, __m64 __B) {
  return _mm_avg_pu16(__A, __B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_pu8(__m64 __A, __m64 __B) {
  return (__m64)__builtin_ia32_psadbw((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psadbw(__m64 __A, __m64 __B) {
  return _mm_sad_pu8(__A, __B);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pi(__m64 *__P, __m64 __A) {
  __builtin_ia32_movntq((unsigned long long *)__P, (unsigned long long)__A);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_ps(float *__P, __m128 __A) {
  __builtin_ia32_movntps(__P, (__v4sf)__A);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sfence(void) {
  __builtin_ia32_sfence();
}
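
/*
 * Usage sketch (illustrative): non-temporal stores bypass the cache, so a
 * producer publishes with _mm_sfence() before signaling a consumer.
 *
 *   _mm_stream_ps(dst + 0, v0);  // dst assumed 16-byte aligned
 *   _mm_stream_ps(dst + 4, v1);
 *   _mm_sfence();                // order the streamed stores before the flag
 */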

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                      \
  do {                                                                 \
    __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
    __v4sf __t0 = __builtin_ia32_unpcklps(__r0, __r1);                 \
    __v4sf __t1 = __builtin_ia32_unpcklps(__r2, __r3);                 \
    __v4sf __t2 = __builtin_ia32_unpckhps(__r0, __r1);                 \
    __v4sf __t3 = __builtin_ia32_unpckhps(__r2, __r3);                 \
    (row0) = __builtin_ia32_movlhps(__t0, __t1);                       \
    (row1) = __builtin_ia32_movhlps(__t1, __t0);                       \
    (row2) = __builtin_ia32_movlhps(__t2, __t3);                       \
    (row3) = __builtin_ia32_movhlps(__t3, __t2);                       \
  } while (0)
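
/*
 * Usage sketch (illustrative): transpose a 4x4 matrix held in four rows,
 * in place.
 *
 *   __m128 r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
 *   __m128 r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
 *   __m128 r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
 *   __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
 *   // r0 = {1, 5, 9, 13}, r1 = {2, 6, 10, 14}, ...
 */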

#include "third_party/intel/emmintrin.internal.h"

#ifdef __DISABLE_SSE__
#undef __DISABLE_SSE__
#pragma GCC pop_options
#endif

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_pause(void) {
  __builtin_ia32_pause();
}

#endif /* _XMMINTRIN_H_INCLUDED */
#endif /* __x86_64__ && !__ASSEMBLER__ && !__LINKER__ */