cosmopolitan/third_party/intel/emmintrin.internal.h
2023-05-10 04:20:47 -07:00

1499 lines
49 KiB
C

#ifndef _EMMINTRIN_H_INCLUDED
#define _EMMINTRIN_H_INCLUDED
#ifdef __x86_64__
#include "third_party/intel/xmmintrin.internal.h"
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));
typedef signed char __v16qs __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
__attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
__attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd(double __F) {
return __extension__(__m128d){__F, 0.0};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd(double __F) {
return __extension__(__m128d){__F, __F};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1(double __F) {
return _mm_set1_pd(__F);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd(double __W, double __X) {
return __extension__(__m128d){__X, __W};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd(double __W, double __X) {
return __extension__(__m128d){__W, __X};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd(void) {
__m128d __Y = __Y;
return __Y;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd(void) {
return __extension__(__m128d){0.0, 0.0};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd(__m128d __A, __m128d __B) {
return __extension__(__m128d)
__builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd(double const *__P) {
return *(__m128d *)__P;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd(double const *__P) {
return *(__m128d_u *)__P;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd(double const *__P) {
return _mm_set1_pd(*__P);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd(double const *__P) {
return _mm_set_sd(*__P);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1(double const *__P) {
return _mm_load1_pd(__P);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd(double const *__P) {
__m128d __tmp = _mm_load_pd(__P);
return __builtin_ia32_shufpd(__tmp, __tmp, _MM_SHUFFLE2(0, 1));
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd(double *__P, __m128d __A) {
*(__m128d *)__P = __A;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd(double *__P, __m128d __A) {
*(__m128d_u *)__P = __A;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd(double *__P, __m128d __A) {
*__P = ((__v2df)__A)[0];
}
extern __inline double
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64(__m128d __A) {
return ((__v2df)__A)[0];
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd(double *__P, __m128d __A) {
_mm_store_sd(__P, __A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd(double *__P, __m128d __A) {
*__P = ((__v2df)__A)[1];
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd(double *__P, __m128d __A) {
_mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 0)));
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1(double *__P, __m128d __A) {
_mm_store1_pd(__P, __A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd(double *__P, __m128d __A) {
_mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 1)));
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32(__m128i __A) {
return __builtin_ia32_vec_ext_v4si((__v4si)__A, 0);
}
#ifdef __x86_64__
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64(__m128i __A) {
return ((__v2di)__A)[0];
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x(__m128i __A) {
return ((__v2di)__A)[0];
}
#endif
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A + (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_addsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A - (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_subsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A * (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_mulsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A / (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_divsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd(__m128d __A) {
return (__m128d)__builtin_ia32_sqrtpd((__v2df)__A);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd(__m128d __A, __m128d __B) {
__v2df __tmp = __builtin_ia32_movsd((__v2df)__A, (__v2df)__B);
return (__m128d)__builtin_ia32_sqrtsd((__v2df)__tmp);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_minpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_minsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_maxpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_maxsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_andpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_andnpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_orpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_xorpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpltpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmplepd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpgtpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpgepd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpngtpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpngepd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpordpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpltsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmplesd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_movsd(
(__v2df)__A, (__v2df)__builtin_ia32_cmpltsd((__v2df)__B, (__v2df)__A));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_movsd(
(__v2df)__A, (__v2df)__builtin_ia32_cmplesd((__v2df)__B, (__v2df)__A));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_movsd(
(__v2df)__A, (__v2df)__builtin_ia32_cmpnltsd((__v2df)__B, (__v2df)__A));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_movsd(
(__v2df)__A, (__v2df)__builtin_ia32_cmpnlesd((__v2df)__B, (__v2df)__A));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpordsd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_comisdeq((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_comisdlt((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_comisdle((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_comisdgt((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_comisdge((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_comisdneq((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_ucomisdeq((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_ucomisdlt((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_ucomisdle((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_ucomisdgt((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_ucomisdge((__v2df)__A, (__v2df)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd(__m128d __A, __m128d __B) {
return __builtin_ia32_ucomisdneq((__v2df)__A, (__v2df)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x(long long __q1, long long __q0) {
return __extension__(__m128i)(__v2di){__q0, __q1};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64(__m64 __q1, __m64 __q0) {
return _mm_set_epi64x((long long)__q1, (long long)__q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
short __q2, short __q1, short __q0) {
return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
__q4, __q5, __q6, __q7};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
char __q10, char __q09, char __q08, char __q07, char __q06,
char __q05, char __q04, char __q03, char __q02, char __q01,
char __q00) {
return __extension__(__m128i)(__v16qi){
__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
__q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x(long long __A) {
return _mm_set_epi64x(__A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64(__m64 __A) {
return _mm_set_epi64(__A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32(int __A) {
return _mm_set_epi32(__A, __A, __A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16(short __A) {
return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8(char __A) {
return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
__A, __A, __A, __A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64(__m64 __q0, __m64 __q1) {
return _mm_set_epi64(__q1, __q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
return _mm_set_epi32(__q3, __q2, __q1, __q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
short __q5, short __q6, short __q7) {
return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
char __q05, char __q06, char __q07, char __q08, char __q09,
char __q10, char __q11, char __q12, char __q13, char __q14,
char __q15) {
return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
__q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128(__m128i const *__P) {
return *__P;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128(__m128i_u const *__P) {
return *__P;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64(__m128i_u const *__P) {
return _mm_set_epi64((__m64)0LL, *(__m64_u *)__P);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si64(void const *__P) {
return _mm_loadl_epi64((__m128i_u *)__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128(__m128i *__P, __m128i __B) {
*__P = __B;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128(__m128i_u *__P, __m128i __B) {
*__P = __B;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64(__m128i_u *__P, __m128i __B) {
*(__m64_u *)__P = (__m64)((__v2di)__B)[0];
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si64(void *__P, __m128i __B) {
_mm_storel_epi64((__m128i_u *)__P, __B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64(__m128i __B) {
return (__m64)((__v2di)__B)[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64(__m64 __A) {
return _mm_set_epi64((__m64)0LL, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64(__m128i __A) {
return (__m128i)__builtin_ia32_movq128((__v2di)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128(void) {
__m128i __Y = __Y;
return __Y;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128(void) {
return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd(__m128i __A) {
return (__m128d)__builtin_ia32_cvtdq2pd((__v4si)__A);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps(__m128i __A) {
return (__m128)__builtin_ia32_cvtdq2ps((__v4si)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32(__m128d __A) {
return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32(__m128d __A) {
return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__A);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps(__m128d __A) {
return (__m128)__builtin_ia32_cvtpd2ps((__v2df)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32(__m128d __A) {
return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32(__m128d __A) {
return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__A);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd(__m64 __A) {
return (__m128d)__builtin_ia32_cvtpi2pd((__v2si)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32(__m128 __A) {
return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32(__m128 __A) {
return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__A);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd(__m128 __A) {
return (__m128d)__builtin_ia32_cvtps2pd((__v4sf)__A);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32(__m128d __A) {
return __builtin_ia32_cvtsd2si((__v2df)__A);
}
#ifdef __x86_64__
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64(__m128d __A) {
return __builtin_ia32_cvtsd2si64((__v2df)__A);
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x(__m128d __A) {
return __builtin_ia32_cvtsd2si64((__v2df)__A);
}
#endif
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32(__m128d __A) {
return __builtin_ia32_cvttsd2si((__v2df)__A);
}
#ifdef __x86_64__
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64(__m128d __A) {
return __builtin_ia32_cvttsd2si64((__v2df)__A);
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x(__m128d __A) {
return __builtin_ia32_cvttsd2si64((__v2df)__A);
}
#endif
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss(__m128 __A, __m128d __B) {
return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd(__m128d __A, int __B) {
return (__m128d)__builtin_ia32_cvtsi2sd((__v2df)__A, __B);
}
#ifdef __x86_64__
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd(__m128d __A, long long __B) {
return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd(__m128d __A, long long __B) {
return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B);
}
#endif
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd(__m128d __A, __m128 __B) {
return (__m128d)__builtin_ia32_cvtss2sd((__v2df)__A, (__v4sf)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
return (__m128d)__builtin_ia32_shufpd((__v2df)__A, (__v2df)__B, __mask);
}
#else
#define _mm_shuffle_pd(A, B, N) \
((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
(int)(N)))
#endif
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_unpckhpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd(__m128d __A, __m128d __B) {
return (__m128d)__builtin_ia32_unpcklpd((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd(__m128d __A, double const *__B) {
return (__m128d)__builtin_ia32_loadhpd((__v2df)__A, __B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd(__m128d __A, double const *__B) {
return (__m128d)__builtin_ia32_loadlpd((__v2df)__A, __B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd(__m128d __A) {
return __builtin_ia32_movmskpd((__v2df)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_packssdw128((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_packuswb128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpckldq128((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8(__m128i __A, __m128i __B) {
return (__m128i)((__v16qu)__A + (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hu)__A + (__v8hu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32(__m128i __A, __m128i __B) {
return (__m128i)((__v4su)__A + (__v4su)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64(__m128i __A, __m128i __B) {
return (__m128i)((__v2du)__A + (__v2du)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_paddsb128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_paddsw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_paddusb128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_paddusw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8(__m128i __A, __m128i __B) {
return (__m128i)((__v16qu)__A - (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hu)__A - (__v8hu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32(__m128i __A, __m128i __B) {
return (__m128i)((__v4su)__A - (__v4su)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64(__m128i __A, __m128i __B) {
return (__m128i)((__v2du)__A - (__v2du)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psubsb128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psubsw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psubusb128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psubusw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hu)__A * (__v8hu)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pmuludq((__v2si)__A, (__v2si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pmuludq128((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_psllwi128((__v8hi)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_pslldi128((__v4si)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_psllqi128((__v2di)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_psrawi128((__v8hi)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_psradi128((__v4si)__A, __B);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128(__m128i __A, const int __N) {
return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128(__m128i __A, const int __N) {
return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128(__m128i __A, const int __N) {
return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128(__m128i __A, const int __N) {
return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8);
}
#else
#define _mm_bsrli_si128(A, N) \
((__m128i)__builtin_ia32_psrldqi128((__m128i)(A), (int)(N)*8))
#define _mm_bslli_si128(A, N) \
((__m128i)__builtin_ia32_pslldqi128((__m128i)(A), (int)(N)*8))
#define _mm_srli_si128(A, N) \
((__m128i)__builtin_ia32_psrldqi128((__m128i)(A), (int)(N)*8))
#define _mm_slli_si128(A, N) \
((__m128i)__builtin_ia32_pslldqi128((__m128i)(A), (int)(N)*8))
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_psrldi128((__v4si)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64(__m128i __A, int __B) {
return (__m128i)__builtin_ia32_psrlqi128((__v2di)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psraw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psrad128((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psrlw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psrld128((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psrlq128((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128(__m128i __A, __m128i __B) {
return (__m128i)((__v2du)__A & (__v2du)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pandn128((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128(__m128i __A, __m128i __B) {
return (__m128i)((__v2du)__A | (__v2du)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128(__m128i __A, __m128i __B) {
return (__m128i)((__v2du)__A ^ (__v2du)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8(__m128i __A, __m128i __B) {
return (__m128i)((__v16qs)__A == (__v16qs)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hi)__A == (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32(__m128i __A, __m128i __B) {
return (__m128i)((__v4si)__A == (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8(__m128i __A, __m128i __B) {
return (__m128i)((__v16qs)__A < (__v16qs)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hi)__A < (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32(__m128i __A, __m128i __B) {
return (__m128i)((__v4si)__A < (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8(__m128i __A, __m128i __B) {
return (__m128i)((__v16qs)__A > (__v16qs)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hi)__A > (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32(__m128i __A, __m128i __B) {
return (__m128i)((__v4si)__A > (__v4si)__B);
}
#ifdef __OPTIMIZE__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16(__m128i const __A, int const __N) {
return (unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)__A, __N);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(A), \
(int)(N)))
#define _mm_insert_epi16(A, D, N) \
((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(A), (int)(D), \
(int)(N)))
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pminsw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pminub128((__v16qi)__A, (__v16qi)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8(__m128i __A) {
return __builtin_ia32_pmovmskb128((__v16qi)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__A, (__v8hi)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16(__m128i __A, const int __mask) {
return (__m128i)__builtin_ia32_pshufhw((__v8hi)__A, __mask);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16(__m128i __A, const int __mask) {
return (__m128i)__builtin_ia32_pshuflw((__v8hi)__A, __mask);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32(__m128i __A, const int __mask) {
return (__m128i)__builtin_ia32_pshufd((__v4si)__A, __mask);
}
#else
#define _mm_shufflehi_epi16(A, N) \
((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shufflelo_epi16(A, N) \
((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shuffle_epi32(A, N) \
((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(A), (int)(N)))
#endif
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
__builtin_ia32_maskmovdqu((__v16qi)__A, (__v16qi)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pavgb128((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_pavgw128((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_psadbw128((__v16qi)__A, (__v16qi)__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32(int *__A, int __B) {
__builtin_ia32_movnti(__A, __B);
}
#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64(long long int *__A, long long int __B) {
__builtin_ia32_movnti64(__A, __B);
}
#endif
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128(__m128i *__A, __m128i __B) {
__builtin_ia32_movntdq((__v2di *)__A, (__v2di)__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd(double *__A, __m128d __B) {
__builtin_ia32_movntpd(__A, (__v2df)__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush(void const *__A) {
__builtin_ia32_clflush(__A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence(void) {
__builtin_ia32_lfence();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence(void) {
__builtin_ia32_mfence();
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128(int __A) {
return _mm_set_epi32(0, 0, 0, __A);
}
#ifdef __x86_64__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128(long long __A) {
return _mm_set_epi64x(0, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128(long long __A) {
return _mm_set_epi64x(0, __A);
}
#endif
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A) {
return (__m128)__A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A) {
return (__m128i)__A;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A) {
return (__m128d)__A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A) {
return (__m128i)__A;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A) {
return (__m128)__A;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A) {
return (__m128d)__A;
}
#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */
#endif /* __x86_64__ */
#endif /* _EMMINTRIN_H_INCLUDED */