#ifndef COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_
#include "libc/bits/xmmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)

/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2                                       ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/

typedef char __v16qi _Vector_size(16);
typedef unsigned char __v16qu _Vector_size(16);
typedef signed char __v16qs _Vector_size(16);

typedef short __v8hi _Vector_size(16);
typedef unsigned short __v8hu _Vector_size(16);

typedef double __v2df _Vector_size(16);
typedef double __m128d _Vector_size(16) forcealign(16);
typedef double __m128d_u _Vector_size(16) forcealign(1);

typedef long long __v2di _Vector_size(16);
typedef long long __m128i _Vector_size(16) forcealign(16);
typedef long long __m128i_u _Vector_size(16) forcealign(1);
typedef unsigned long long __v2du _Vector_size(16);

struct thatispacked mayalias __usi128ma {
  __m128i_u __v;
};

/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » memory ops                          ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/

#define _mm_loadu_si128(M128IP) ((struct __usi128ma *)(M128IP))->__v
#define _mm_storeu_si128(M128IP, M128I) \
  (((struct __usi128ma *)(M128IP))->__v = (M128I))

#define _mm_set_epi8(I8_15, I8_14, I8_13, I8_12, I8_11, I8_10, I8_9, I8_8,  \
                     I8_7, I8_6, I8_5, I8_4, I8_3, I8_2, I8_1, I8_0)        \
  ((__m128i)(__v16qi){I8_0, I8_1, I8_2, I8_3, I8_4, I8_5, I8_6, I8_7, I8_8, \
                      I8_9, I8_10, I8_11, I8_12, I8_13, I8_14, I8_15})
#define _mm_set_epi16(I16_7, I16_6, I16_5, I16_4, I16_3, I16_2, I16_1, I16_0) \
  ((__m128i)(__v8hi){I16_0, I16_1, I16_2, I16_3, I16_4, I16_5, I16_6, I16_7})
#define _mm_set_epi32(I32_3, I32_2, I32_1, I32_0) \
  ((__m128i)(__v4si){I32_0, I32_1, I32_2, I32_3})
#define _mm_set_epi64x(I64_1, I64_0) ((__m128i)(__v2di){I64_0, I64_1})

#define _mm_setr_epi8(I8_15, I8_14, I8_13, I8_12, I8_11, I8_10, I8_9, I8_8, \
                      I8_7, I8_6, I8_5, I8_4, I8_3, I8_2, I8_1, I8_0)       \
  _mm_set_epi8(I8_0, I8_1, I8_2, I8_3, I8_4, I8_5, I8_6, I8_7, I8_8, I8_9,  \
               I8_10, I8_11, I8_12, I8_13, I8_14, I8_15)
#define _mm_setr_epi16(I16_7, I16_6, I16_5, I16_4, I16_3, I16_2, I16_1, I16_0) \
  _mm_set_epi16(I16_0, I16_1, I16_2, I16_3, I16_4, I16_5, I16_6, I16_7)
#define _mm_setr_epi32(I32_3, I32_2, I32_1, I32_0) \
  _mm_set_epi32(I32_0, I32_1, I32_2, I32_3)
#define _mm_setr_epi64x(I64_1, I64_0) _mm_set_epi64x(I64_0, I64_1)

#define _mm_set1_epi8(I8) \
  _mm_set_epi8(I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8)
#define _mm_set1_epi16(I16) \
  _mm_set_epi16(I16, I16, I16, I16, I16, I16, I16, I16)
#define _mm_set1_epi32(I32)  _mm_set_epi32(I32, I32, I32, I32)
#define _mm_set1_epi64x(I64) _mm_set_epi64x(I64, I64)

#define _mm_cvtsi128_si32(M128I) ((__v4si)(M128I))[0]
#define _mm_cvtsi32_si128(I32)   ((__m128i)(__v4si){(I32), 0, 0, 0})
#define _mm_setzero_si128()      ((__m128i)(__v2di){0LL, 0LL})
#define _mm_castsi128_ps(M128I)  ((__m128)(M128I))
#define _mm_castps_si128(M128)   ((__m128i)(M128))
#define _mm_load_si128(M128I)    (*(M128I))

/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » simd ops                            ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/

#define _mm_and_si128(M128I_0, M128I_1) \
  ((__m128i)((__v2du)(M128I_0) & (__v2du)(M128I_1)))
#define _mm_or_si128(M128I_0, M128I_1) \
  ((__m128i)((__v2du)(M128I_0) | (__v2du)(M128I_1)))
#define _mm_xor_si128(M128I_0, M128I_1) \
  ((__m128i)((__v2du)(M128I_0) ^ (__v2du)(M128I_1)))
#define _mm_andnot_si128(M128I_0, M128I_1) \
  ((__m128i)(~(__v2du)(M128I_0) & (__v2du)(M128I_1)))

#define _mm_add_pd(M128D_0, M128D_1) \
  (__m128d)((__v2df)(M128D_0) + (__v2df)(M128D_1))
#define _mm_sub_pd(M128D_0, M128D_1) \
  (__m128d)((__v2df)(M128D_0) - (__v2df)(M128D_1))
#define _mm_mul_pd(M128D_0, M128D_1) \
  (__m128d)((__v2df)(M128D_0) * (__v2df)(M128D_1))
#define _mm_div_pd(M128D_0, M128D_1) \
  (__m128d)((__v2df)(M128D_0) / (__v2df)(M128D_1))
#define _mm_and_pd(M128D_0, M128D_1) \
  (__m128d)((__v2df)(M128D_0) & (__v2df)(M128D_1))
#define _mm_or_pd(M128D_0, M128D_1) \
  (__m128d)((__v2df)(M128D_0) | (__v2df)(M128D_1))
#define _mm_xor_pd(M128D_0, M128D_1) \
  (__m128d)((__v2df)(M128D_0) ^ (__v2df)(M128D_1))
#define _mm_andnot_pd(M128D_0, M128D_1) \
  (__m128d)(~(__v2df)(M128D_0) & (__v2df)(M128D_1))
#define _mm_sqrt_pd(M128D) __builtin_ia32_sqrtpd((__v2df)(M128D))

#define _mm_min_pd(M128D_0, M128D_1) \
  __builtin_ia32_minpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_max_pd(M128D_0, M128D_1) \
  __builtin_ia32_maxpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpeq_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpeqpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpneq_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpneqpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmplt_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpltpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnlt_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnltpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmple_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmplepd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnle_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnlepd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpgt_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpltpd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpngt_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnltpd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpge_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmplepd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpnge_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnlepd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpord_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpordpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpunord_pd(M128D_0, M128D_1) \
  __builtin_ia32_cmpunordpd((__v2df)(M128D_0), (__v2df)(M128D_1))

#define _mm_sad_epu8(M128I_0, M128I_1) \
  __builtin_ia32_psadbw128((__v16qi)(M128I_0), (__v16qi)(M128I_1))

#define _mm_subs_epi8(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubsb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_subs_epu8(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubusw128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_subs_epi16(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_subs_epu16(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubusw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))

#define _mm_add_epi32(M128I_0, M128I_1) \
  ((__m128i)((__v4su)(M128I_0) + (__v4su)(M128I_1)))
#define _mm_sub_epi32(M128I_0, M128I_1) \
  ((__m128i)((__v4su)(M128I_0) - (__v4su)(M128I_1)))
#define _mm_madd_epi16(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_pmaddwd128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_shuffle_epi32(V, IMM) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(V), (int)(IMM)))

#define _mm_slli_epi32(M128I, COUNT) \
  ((__m128i)__builtin_ia32_pslldi128((__v4si)(M128I), (COUNT)))

#define _mm_slli_si128(M128I, IMM) \
  ((__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(M128I), (int)(IMM)*8))
#define _mm_srli_si128(M128I, IMM) \
  ((__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(M128I), (int)(IMM)*8))

/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » scalar ops                          ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/

#define _mm_sqrt_sd(M128D_0, M128D_1)                          \
  ({                                                           \
    __m128d M128d2 = __builtin_ia32_sqrtsd((__v2df)(M128D_1)); \
    (__m128d){M128d2[0], (M128D_0)[1]};                        \
  })

#define _mm_add_sd(M128D_0, M128D_1) \
  ({                                 \
    (M128D_0)[0] += (M128D_1)[0];    \
    (M128D_0);                       \
  })
#define _mm_sub_sd(M128D_0, M128D_1) \
  ({                                 \
    (M128D_0)[0] -= (M128D_1)[0];    \
    (M128D_0);                       \
  })
#define _mm_mul_sd(M128D_0, M128D_1) \
  ({                                 \
    (M128D_0)[0] *= (M128D_1)[0];    \
    (M128D_0);                       \
  })
#define _mm_div_sd(M128D_0, M128D_1) \
  ({                                 \
    (M128D_0)[0] /= (M128D_1)[0];    \
    (M128D_0);                       \
  })

#define _mm_min_sd(M128D_0, M128D_1) \
  __builtin_ia32_minsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_max_sd(M128D_0, M128D_1) \
  __builtin_ia32_maxsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpeq_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpeqsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpneq_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpneqsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmplt_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpltsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnlt_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnltsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmple_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmplesd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnle_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnlesd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpgt_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpltsd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpngt_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnltsd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpge_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmplesd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpnge_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpnlesd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpord_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpordsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpunord_sd(M128D_0, M128D_1) \
  __builtin_ia32_cmpunordsd((__v2df)(M128D_0), (__v2df)(M128D_1))

#define _mm_SSE2(op, A, B)	\
  ({				\
      __m128i R = A;		\
      asm(#op " %1, %0"		\
          : "+x"(R) : "xm"(B));	\
      R;			\
  })
#define _mm_mul_epu32(A, B)		_mm_SSE2(pmuludq, A, B)
#define _mm_add_epi64(A, B)		_mm_SSE2(paddq, A, B)
#define _mm_srli_epi64(A, B)		_mm_SSE2(psrlq, A, B)
#define _mm_slli_epi64(A, B)		_mm_SSE2(psllq, A, B)
#define _mm_unpacklo_epi64(A, B)	_mm_SSE2(punpcklqdq, A, B)
#define _mm_unpackhi_epi64(A, B)	_mm_SSE2(punpckhqdq, A, B)

/*───────────────────────────────────────────────────────────────────────────│─╗
│ cosmopolitan § it's a trap! » sse2 » miscellaneous                       ─╬─│┼
╚────────────────────────────────────────────────────────────────────────────│*/

#define _mm_pause() asm("rep nop")

#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ */