added emmintrin and pmmintrin

This commit is contained in:
Gautham 2023-04-16 16:29:20 -05:00
parent 7c5c2e2b66
commit ca5a08cafc
2 changed files with 451 additions and 3 deletions

View file

@ -240,5 +240,425 @@ struct thatispacked mayalias __usi128ma {
/* Emits a `rep nop` instruction through inline assembly. */
/* NOTE(review): plain `asm` is a GNU keyword that GCC does not recognize under
   strict -std=c* modes — confirm this header is only consumed in GNU mode. */
#define _mm_pause() asm("rep nop")
/*
 * Constructors for two-lane double vectors, written as __m128d compound
 * literals. Every argument is cast to double before it is stored.
 */
/* First initializer is DBL_0, second is 0.0. */
#define _mm_set_sd(DBL_0) ((__m128d){(double)(DBL_0), 0.0})
/* Both initializers are DBL_0; note the argument is expanded twice. */
#define _mm_set1_pd(DBL_0) ((__m128d){(double)(DBL_0), (double)(DBL_0)})
/* Alias that forwards to _mm_set1_pd. */
#define _mm_set_pd1(DBL_0) (_mm_set1_pd((double)(DBL_0)))
/* Initializers are DBL_1 then DBL_0, i.e. the reverse of _mm_setr_pd. */
#define _mm_set_pd(DBL_0, DBL_1) ((__m128d){(double)(DBL_1), (double)(DBL_0)})
/* Initializers are DBL_0 then DBL_1, in argument order. */
#define _mm_setr_pd(DBL_0, DBL_1) ((__m128d){(double)(DBL_0), (double)(DBL_1)})
/*
 * Yields a __m128d whose contents are deliberately left unspecified.
 *
 * The self-initialization (`__Y = __Y`) is the GCC idiom for declaring a
 * variable that is uninitialized on purpose. The value of a statement
 * expression is its final expression statement, so the block must end in
 * `__Y;` — a `return` here would instead return from whichever function
 * expands the macro.
 */
#define _mm_undefined_pd() \
  ({                       \
    __m128d __Y = __Y;     \
    __Y;                   \
  })
/* Two-lane double vector literal with both initializers set to 0.0. */
#define _mm_setzero_pd() ((__m128d){0.0, 0.0})
/*
 * Merges two double vectors with __builtin_shuffle and the selector {2, 1}.
 *
 * __builtin_shuffle numbers the lanes of its first operand 0..1 and those of
 * its second operand 2..3, so the result takes lane 0 from M128D_1 and lane 1
 * from M128D_0.
 *
 * The operands must be the macro's own parameters; names such as __A/__B only
 * exist inside the inline-function form of this intrinsic.
 */
#define _mm_move_sd(M128D_0, M128D_1)                               \
  ((__m128d)__builtin_shuffle((__v2df)(M128D_0), (__v2df)(M128D_1), \
                              (__v2di){2, 1}))
/*
 * Loads of double vectors. Each macro first casts its argument to
 * `double const *`.
 */
/* Dereferences the pointer as `__m128d *`. */
/* NOTE(review): presumably requires the alignment of __m128d, in contrast to
   the `__m128d_u` variant below — confirm against the typedefs. */
#define _mm_load_pd(DBL_CONSTPTR_0) \
(*(__m128d *)(double const *)(DBL_CONSTPTR_0))
/* Dereferences the pointer as `__m128d_u *`. */
#define _mm_loadu_pd(DBL_CONSTPTR_0) \
(*(__m128d_u *)(double const *)(DBL_CONSTPTR_0))
/* Reads one double and hands it to _mm_set1_pd. */
#define _mm_load1_pd(DBL_CONSTPTR_0) \
(_mm_set1_pd(*(double const *)(DBL_CONSTPTR_0)))
/* Reads one double and hands it to _mm_set_sd. */
#define _mm_load_sd(DBL_CONSTPTR_0) \
(_mm_set_sd(*(double const *)(DBL_CONSTPTR_0)))
/* Alias that forwards to _mm_load1_pd. */
#define _mm_load_pd1(DBL_CONSTPTR_0) \
(_mm_load1_pd((double const *)(DBL_CONSTPTR_0)))
/* Loads with _mm_load_pd into a temporary, then yields shufpd of that
   temporary with itself using selector 1 (presumably swapping the two
   lanes — confirm against the shufpd selector encoding). */
#define _mm_loadr_pd(DBL_CONSTPTR_0) \
({ \
__m128d __tmp = _mm_load_pd((double const *)(DBL_CONSTPTR_0)); \
__builtin_ia32_shufpd(__tmp, __tmp, 1); \
})
/*
 * Stores of double vectors. Each macro first casts its destination to
 * `double *`.
 */
/* Assigns the vector through a `__m128d *` view of the destination. */
#define _mm_store_pd(DBLPTR_0, M128D_0) \
(*(__m128d *)(double *)(DBLPTR_0) = (M128D_0))
/* Assigns the vector through a `__m128d_u *` view of the destination. */
#define _mm_storeu_pd(DBLPTR_0, M128D_0) \
(*(__m128d_u *)(double *)(DBLPTR_0) = (M128D_0))
/* Writes element [0] of the vector to the destination double. */
#define _mm_store_sd(DBLPTR_0, M128D_0) \
(*(double *)(DBLPTR_0) = ((__v2df)(M128D_0))[0])
/* Yields element [0] of the vector as a scalar. */
#define _mm_cvtsd_f64(M128D_0) (((__v2df)(M128D_0))[0])
/* Alias that forwards to _mm_store_sd. */
#define _mm_storel_pd(DBLPTR_0, M128D_0) \
(_mm_store_sd((double *)(DBLPTR_0), (M128D_0)))
/* Writes element [1] of the vector to the destination double. */
#define _mm_storeh_pd(DBLPTR_0, M128D_0) \
(*(double *)(DBLPTR_0) = ((__v2df)(M128D_0))[1])
/* Stores shufpd(v, v, 0) via _mm_store_pd; the vector argument is expanded
   twice. */
#define _mm_store1_pd(DBLPTR_0, M128D_0) \
(_mm_store_pd((double *)(DBLPTR_0), \
__builtin_ia32_shufpd((M128D_0), (M128D_0), 0)))
/* Alias that forwards to _mm_store1_pd. */
#define _mm_store_pd1(DBLPTR_0, M128D_0) \
(_mm_store1_pd((double *)(DBLPTR_0), (M128D_0)))
/* Stores shufpd(v, v, 1) via _mm_store_pd; the vector argument is expanded
   twice. */
#define _mm_storer_pd(DBLPTR_0, M128D_0) \
(_mm_store_pd((double *)(DBLPTR_0), \
__builtin_ia32_shufpd((M128D_0), (M128D_0), 1)))
/*
 * Scalar double comparisons. Each macro casts both operands to __v2df and
 * yields the result of the matching __builtin_ia32_comisd* builtin
 * (eq, lt, le, gt, ge, neq).
 */
#define _mm_comieq_sd(M128D_0, M128D_1) \
(__builtin_ia32_comisdeq((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_comilt_sd(M128D_0, M128D_1) \
(__builtin_ia32_comisdlt((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_comile_sd(M128D_0, M128D_1) \
(__builtin_ia32_comisdle((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_comigt_sd(M128D_0, M128D_1) \
(__builtin_ia32_comisdgt((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_comige_sd(M128D_0, M128D_1) \
(__builtin_ia32_comisdge((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_comineq_sd(M128D_0, M128D_1) \
(__builtin_ia32_comisdneq((__v2df)(M128D_0), (__v2df)(M128D_1)))
/*
 * Same six predicates, routed to the __builtin_ia32_ucomisd* builtins
 * instead.
 */
#define _mm_ucomieq_sd(M128D_0, M128D_1) \
(__builtin_ia32_ucomisdeq((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_ucomilt_sd(M128D_0, M128D_1) \
(__builtin_ia32_ucomisdlt((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_ucomile_sd(M128D_0, M128D_1) \
(__builtin_ia32_ucomisdle((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_ucomigt_sd(M128D_0, M128D_1) \
(__builtin_ia32_ucomisdgt((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_ucomige_sd(M128D_0, M128D_1) \
(__builtin_ia32_ucomisdge((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_ucomineq_sd(M128D_0, M128D_1) \
(__builtin_ia32_ucomisdneq((__v2df)(M128D_0), (__v2df)(M128D_1)))
/*
 * Casts the integer vector M128I_0 to __v4si, passes it to
 * __builtin_ia32_cvtdq2pd and yields the result as __m128d.
 *
 * The operand must be the macro parameter itself: `__A` is the argument name
 * of the inline-function form and is not in scope at the call site.
 */
#define _mm_cvtepi32_pd(M128I_0) \
  ((__m128d)__builtin_ia32_cvtdq2pd((__v4si)(M128I_0)))
/*
 * Conversions. Each macro casts its vector operand(s) to the element-typed
 * vector the builtin expects, calls the builtin named in its body, and casts
 * the result to the public vector type where one is returned.
 */
/* Packed double source (__v2df): cvtpd2dq, cvtpd2pi, cvtpd2ps and the
   cvttpd2dq / cvttpd2pi variants. */
#define _mm_cvtpd_epi32(M128D_0) \
((__m128i)__builtin_ia32_cvtpd2dq((__v2df)(M128D_0)))
#define _mm_cvtpd_pi32(M128D_0) \
((__m64)__builtin_ia32_cvtpd2pi((__v2df)(M128D_0)))
#define _mm_cvtpd_ps(M128D_0) \
((__m128)__builtin_ia32_cvtpd2ps((__v2df)(M128D_0)))
#define _mm_cvttpd_epi32(M128D_0) \
((__m128i)__builtin_ia32_cvttpd2dq((__v2df)(M128D_0)))
#define _mm_cvttpd_pi32(M128D_0) \
((__m64)__builtin_ia32_cvttpd2pi((__v2df)(M128D_0)))
/* Packed double result from a __v2si or __v4sf source. */
#define _mm_cvtpi32_pd(M64_0) \
((__m128d)__builtin_ia32_cvtpi2pd((__v2si)(M64_0)))
#define _mm_cvtps_pd(M128_0) \
((__m128d)__builtin_ia32_cvtps2pd((__v4sf)(M128_0)))
/* Scalar integer results. The `si64` and `si64x` spellings expand to the
   same builtin. */
#define _mm_cvtsd_si32(M128D_0) (__builtin_ia32_cvtsd2si((__v2df)(M128D_0)))
#define _mm_cvtsd_si64(M128D_0) (__builtin_ia32_cvtsd2si64((__v2df)(M128D_0)))
#define _mm_cvtsd_si64x(M128D_0) (__builtin_ia32_cvtsd2si64((__v2df)(M128D_0)))
#define _mm_cvttsd_si32(M128D_0) (__builtin_ia32_cvttsd2si((__v2df)(M128D_0)))
#define _mm_cvttsd_si64(M128D_0) (__builtin_ia32_cvttsd2si64((__v2df)(M128D_0)))
#define _mm_cvttsd_si64x(M128D_0) \
(__builtin_ia32_cvttsd2si64((__v2df)(M128D_0)))
/* Two-operand forms: the first operand is a vector passed through to the
   builtin alongside the value being converted. */
#define _mm_cvtsd_ss(M128_0, M128D_1) \
((__m128)__builtin_ia32_cvtsd2ss((__v4sf)(M128_0), (__v2df)(M128D_1)))
#define _mm_cvtsi32_sd(M128D_0, INT_1) \
((__m128d)__builtin_ia32_cvtsi2sd((__v2df)(M128D_0), (INT_1)))
#define _mm_cvtsi64_sd(M128D_0, LL_1) \
((__m128d)__builtin_ia32_cvtsi642sd((__v2df)(M128D_0), (LL_1)))
#define _mm_cvtsi64x_sd(M128D_0, LL_1) \
((__m128d)__builtin_ia32_cvtsi642sd((__v2df)(M128D_0), (LL_1)))
#define _mm_cvtss_sd(M128D_0, M128_1) \
((__m128d)__builtin_ia32_cvtss2sd((__v2df)(M128D_0), (__v4sf)(M128_1)))
/*
 * Lane rearrangement for double vectors: both operands are cast to __v2df
 * and handed to the shufpd / unpckhpd / unpcklpd builtin. MSK is forwarded
 * unchanged as the shufpd selector.
 */
#define _mm_shuffle_pd(M128D_0, M128D_1, MSK) \
((__m128d)__builtin_ia32_shufpd((__v2df)(M128D_0), (__v2df)(M128D_1), (MSK)))
#define _mm_unpackhi_pd(M128D_0, M128D_1) \
((__m128d)__builtin_ia32_unpckhpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_unpacklo_pd(M128D_0, M128D_1) \
((__m128d)__builtin_ia32_unpcklpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
/* Forward an existing vector plus a `double const *` to the loadhpd /
   loadlpd builtin. */
#define _mm_loadh_pd(M128D_0, DBL_CONSTPTR_1) \
((__m128d)__builtin_ia32_loadhpd((__v2df)(M128D_0), \
(double const *)(DBL_CONSTPTR_1)))
#define _mm_loadl_pd(M128D_0, DBL_CONSTPTR_1) \
((__m128d)__builtin_ia32_loadlpd((__v2df)(M128D_0), \
(double const *)(DBL_CONSTPTR_1)))
/* Yields the result of the movmskpd builtin for the vector. */
#define _mm_movemask_pd(M128D_0) (__builtin_ia32_movmskpd((__v2df)(M128D_0)))
/* Passes the destination (as `double *`) and the vector to the movntpd
   builtin. */
#define _mm_stream_pd(DBLPTR_0, M128D_1) \
(__builtin_ia32_movntpd((double *)(DBLPTR_0), (__v2df)(M128D_1)))
/* Plain vector casts between the public 128-bit types; no builtin is
   involved. */
#define _mm_castpd_ps(M128D_0) ((__m128)(M128D_0))
#define _mm_castpd_si128(M128D_0) ((__m128i)(M128D_0))
#define _mm_castps_pd(M128_0) ((__m128d)(M128_0))
/* Yield element [0] of the vector viewed as __v2di; both spellings expand
   identically. */
#define _mm_cvtsi128_si64(M128I_0) (((__v2di)(M128I_0))[0])
#define _mm_cvtsi128_si64x(M128I_0) (((__v2di)(M128I_0))[0])
/* Forwards both arguments, in the same order, to _mm_set_epi64x (defined
   outside this section). */
/* NOTE(review): other macros here pass __m64 values into this macro and they
   are forwarded unconverted — confirm _mm_set_epi64x accepts that type. */
#define _mm_set_epi64(LL_1, LL_0) (_mm_set_epi64x((LL_1), (LL_0)))
/* Passes the same value for both arguments; the argument is expanded
   twice. */
#define _mm_set1_epi64(M64_0) (_mm_set_epi64((M64_0), (M64_0)))
/* Swaps its two arguments before forwarding to _mm_set_epi64. */
#define _mm_setr_epi64(M64_0, M64_1) (_mm_set_epi64((M64_1), (M64_0)))
/* Dereferences the pointer as `__m128i_u const *`. */
#define _mm_loadu_si128(PTR) (*(__m128i_u const *)(PTR))
/* Reads one `__m64_u` from PTR and pairs it with a zero __m64 through
   _mm_set_epi64. */
#define _mm_loadl_epi64(PTR) \
(_mm_set_epi64((__m64)0LL, *(__m64_u *)(__m128i_u const *)(PTR)))
/* Alias that forwards to _mm_loadl_epi64 after casting PTR. */
#define _mm_loadu_si64(PTR) (_mm_loadl_epi64((__m128i_u *)(PTR)))
/* Assigns through the pointer exactly as given (no cast is applied). */
#define _mm_store_si128(M128I_PTR_0, M128I_1) (*(M128I_PTR_0) = (M128I_1))
/* Assigns through a `__m128i_u *` view of PTR. */
#define _mm_storeu_si128(PTR, M128I_1) (*(__m128i_u *)(PTR) = (M128I_1))
/*
 * Writes element [0] of M128I_1 (viewed as __v2di and cast to __m64) to PTR
 * through a `__m64_u *` view of the destination.
 *
 * The value operand must be spelled exactly like the parameter, M128I_1;
 * any other identifier is looked up at the call site instead of being
 * substituted.
 */
#define _mm_storel_epi64(PTR, M128I_1) \
  (*(__m64_u *)(__m128i_u *)(PTR) = (__m64)((__v2di)(M128I_1))[0])
/* Alias that forwards to _mm_storel_epi64 after casting PTR to
   `__m128i_u *`. */
#define _mm_storeu_si64(PTR, M128I_1) \
(_mm_storel_epi64((__m128i_u *)(PTR), M128I_1))
/* Yields element [0] of the vector (viewed as __v2di) cast to __m64. */
#define _mm_movepi64_pi64(M128I_1) ((__m64)((__v2di)(M128I_1))[0])
/* Pairs the value with a zero __m64 through _mm_set_epi64. */
#define _mm_movpi64_epi64(M64_0) (_mm_set_epi64((__m64)0LL, (M64_0)))
/* Passes the vector (as __v2di) to the movq128 builtin. */
#define _mm_move_epi64(M128I_0) \
((__m128i)__builtin_ia32_movq128((__v2di)(M128I_0)))
/*
 * Yields a __m128i whose contents are deliberately left unspecified.
 *
 * The self-initialization (`__Y = __Y`) is the GCC idiom for declaring a
 * variable that is uninitialized on purpose. A statement expression takes
 * its value from its final expression statement; if it ends on a declaration
 * it has type void, so the trailing `__Y;` is required for the macro to be
 * usable as a value.
 */
#define _mm_undefined_si128() \
  ({                          \
    __m128i __Y = __Y;        \
    __Y;                      \
  })
/*
 * Conversions between 32-bit integer lanes (__v4si) and float lanes
 * (__v4sf), routed to the cvtdq2ps / cvtps2dq / cvttps2dq builtins.
 */
#define _mm_cvtepi32_ps(M128I_0) \
((__m128)__builtin_ia32_cvtdq2ps((__v4si)(M128I_0)))
#define _mm_cvtps_epi32(M128_0) \
((__m128i)__builtin_ia32_cvtps2dq((__v4sf)(M128_0)))
#define _mm_cvttps_epi32(M128_0) \
((__m128i)__builtin_ia32_cvttps2dq((__v4sf)(M128_0)))
/* Pack two integer vectors with the packsswb128 / packssdw128 / packuswb128
   builtins. */
#define _mm_packs_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_packsswb128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_packs_epi32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_packssdw128((__v4si)(M128I_0), (__v4si)(M128I_1)))
#define _mm_packus_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_packuswb128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
/* Unpack (punpckh* / punpckl* builtins) at 8-, 16- and 32-bit lane
   widths. */
#define _mm_unpackhi_epi8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_punpckhbw128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_unpackhi_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_punpckhwd128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_unpackhi_epi32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_punpckhdq128((__v4si)(M128I_0), (__v4si)(M128I_1)))
#define _mm_unpacklo_epi8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_punpcklbw128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_unpacklo_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_punpcklwd128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_unpacklo_epi32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_punpckldq128((__v4si)(M128I_0), (__v4si)(M128I_1)))
/*
 * Lane-wise add/subtract/multiply written with the vector `+`, `-` and `*`
 * operators. The operands are cast to unsigned lane types (__v16qu, __v8hu,
 * __v2du) — presumably so that wraparound is well defined; confirm against
 * the typedefs.
 */
#define _mm_add_epi8(M128I_0, M128I_1) \
((__m128i)((__v16qu)(M128I_0) + (__v16qu)(M128I_1)))
#define _mm_add_epi16(M128I_0, M128I_1) \
((__m128i)((__v8hu)(M128I_0) + (__v8hu)(M128I_1)))
/* Additions routed to the paddsb128 / paddsw128 / paddusb128 / paddusw128
   builtins. */
#define _mm_adds_epi8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_paddsb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_adds_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_paddsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_adds_epu8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_paddusb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_adds_epu16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_paddusw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_sub_epi8(M128I_0, M128I_1) \
((__m128i)((__v16qu)(M128I_0) - (__v16qu)(M128I_1)))
#define _mm_sub_epi16(M128I_0, M128I_1) \
((__m128i)((__v8hu)(M128I_0) - (__v8hu)(M128I_1)))
#define _mm_sub_epi64(M128I_0, M128I_1) \
((__m128i)((__v2du)(M128I_0) - (__v2du)(M128I_1)))
/* Multiplication: pmulhw128 builtin, the vector `*` operator on __v8hu, and
   the pmuludq builtin on __m64 operands viewed as __v2si. */
#define _mm_mulhi_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pmulhw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_mullo_epi16(M128I_0, M128I_1) \
((__m128i)((__v8hu)(M128I_0) * (__v8hu)(M128I_1)))
#define _mm_mul_su32(M64_0, M64_1) \
((__m64)__builtin_ia32_pmuludq((__v2si)(M64_0), (__v2si)(M64_1)))
/*
 * Shifts whose count is a scalar (INT_1), routed to the psllwi128 /
 * psrawi128 / psradi128 / psrlwi128 / psrldi128 builtins.
 */
#define _mm_slli_epi16(M128I_0, INT_1) \
((__m128i)__builtin_ia32_psllwi128((__v8hi)(M128I_0), (INT_1)))
#define _mm_srai_epi16(M128I_0, INT_1) \
((__m128i)__builtin_ia32_psrawi128((__v8hi)(M128I_0), (INT_1)))
#define _mm_srai_epi32(M128I_0, INT_1) \
((__m128i)__builtin_ia32_psradi128((__v4si)(M128I_0), (INT_1)))
/* Whole-register shifts via psrldqi128 / pslldqi128. N is multiplied by 8
   before it reaches the builtin (presumably converting a byte count into a
   bit count — confirm against the builtin's contract). */
#define _mm_bsrli_si128(M128I_0, N) \
((__m128i)__builtin_ia32_psrldqi128((M128I_0), (N)*8))
#define _mm_bslli_si128(M128I_0, N) \
((__m128i)__builtin_ia32_pslldqi128((M128I_0), (N)*8))
#define _mm_srli_epi16(M128I_0, INT_1) \
((__m128i)__builtin_ia32_psrlwi128((__v8hi)(M128I_0), (INT_1)))
#define _mm_srli_epi32(M128I_0, INT_1) \
((__m128i)__builtin_ia32_psrldi128((__v4si)(M128I_0), (INT_1)))
/*
 * Shifts whose count is itself a vector (M128I_1), routed to the psll* /
 * psra* / psrl* builtins at 16-, 32- and 64-bit lane widths.
 */
#define _mm_sll_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psllw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_sll_epi32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pslld128((__v4si)(M128I_0), (__v4si)(M128I_1)))
#define _mm_sll_epi64(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psllq128((__v2di)(M128I_0), (__v2di)(M128I_1)))
#define _mm_sra_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psraw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_sra_epi32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psrad128((__v4si)(M128I_0), (__v4si)(M128I_1)))
#define _mm_srl_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psrlw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_srl_epi32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psrld128((__v4si)(M128I_0), (__v4si)(M128I_1)))
#define _mm_srl_epi64(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psrlq128((__v2di)(M128I_0), (__v2di)(M128I_1)))
/*
 * Lane-wise integer comparisons written with the vector `==`, `<` and `>`
 * operators; the comparison result is cast back to __m128i. The 8-bit
 * ordered comparisons go through __v16qs, the 16- and 32-bit ones through
 * __v8hi and __v4si.
 */
#define _mm_cmpeq_epi16(M128I_0, M128I_1) \
((__m128i)((__v8hi)(M128I_0) == (__v8hi)(M128I_1)))
#define _mm_cmpeq_epi32(M128I_0, M128I_1) \
((__m128i)((__v4si)(M128I_0) == (__v4si)(M128I_1)))
#define _mm_cmplt_epi8(M128I_0, M128I_1) \
((__m128i)((__v16qs)(M128I_0) < (__v16qs)(M128I_1)))
#define _mm_cmplt_epi16(M128I_0, M128I_1) \
((__m128i)((__v8hi)(M128I_0) < (__v8hi)(M128I_1)))
#define _mm_cmplt_epi32(M128I_0, M128I_1) \
((__m128i)((__v4si)(M128I_0) < (__v4si)(M128I_1)))
#define _mm_cmpgt_epi8(M128I_0, M128I_1) \
((__m128i)((__v16qs)(M128I_0) > (__v16qs)(M128I_1)))
#define _mm_cmpgt_epi16(M128I_0, M128I_1) \
((__m128i)((__v8hi)(M128I_0) > (__v8hi)(M128I_1)))
#define _mm_cmpgt_epi32(M128I_0, M128I_1) \
((__m128i)((__v4si)(M128I_0) > (__v4si)(M128I_1)))
/* Reads one 16-bit lane with the vec_ext_v8hi builtin and casts the result
   to unsigned short. INT_1 is forwarded as the lane selector. */
#define _mm_extract_epi16(M128I_CONST_0, INT_1) \
((unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(M128I_CONST_0), \
(INT_1)))
/* Forwards the vector and both integer arguments to the vec_set_v8hi
   builtin. */
/* NOTE(review): INT_1 and INT_2 are substituted without parentheses, unlike
   the other macros here — confirm no caller passes a comma expression. */
#define _mm_insert_epi16(M128I_CONST_0, INT_1, INT_2) \
((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(M128I_CONST_0), INT_1, INT_2))
/* Lane-wise max/min through the pmaxsw128 / pmaxub128 / pminsw128 /
   pminub128 builtins. */
#define _mm_max_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pmaxsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_max_epu8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pmaxub128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_min_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pminsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_min_epu8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pminub128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
/* Multiplication through the pmulhuw128 builtin. */
#define _mm_mulhi_epu16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pmulhuw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
/* 16-bit lane shuffles through the pshufhw / pshuflw builtins; MSK is
   forwarded unchanged. */
#define _mm_shufflehi_epi16(M128I_0, MSK) \
((__m128i)__builtin_ia32_pshufhw((__v8hi)(M128I_0), (MSK)))
#define _mm_shufflelo_epi16(M128I_0, MSK) \
((__m128i)__builtin_ia32_pshuflw((__v8hi)(M128I_0), (MSK)))
/*
 * Forwards two integer vectors (each cast to __v16qi) and a destination
 * pointer (cast to `char *`) to the maskmovdqu builtin. The builtin's
 * result, if any, is the value of the macro.
 *
 * The expansion is wrapped in one outer pair of parentheses like every other
 * macro in this header; the pair must be balanced or every use site fails to
 * parse.
 */
#define _mm_maskmoveu_si128(M128I_0, M128I_1, CHAR_PTR_2)            \
  (__builtin_ia32_maskmovdqu((__v16qi)(M128I_0), (__v16qi)(M128I_1), \
                             (char *)(CHAR_PTR_2)))
/* Lane-wise averages through the pavgb128 / pavgw128 builtins. */
#define _mm_avg_epu8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pavgb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_avg_epu16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pavgw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
/* Stores routed to the movnti / movnti64 / movntdq builtins. Note that the
   64-bit form forwards its pointer without a cast, unlike its siblings. */
#define _mm_stream_si32(INT_PTR_0, INT_1) \
(__builtin_ia32_movnti((int *)(INT_PTR_0), (INT_1)))
#define _mm_stream_si64(LL_PTR_0, LL_1) \
(__builtin_ia32_movnti64((LL_PTR_0), (LL_1)))
#define _mm_stream_si128(M128I_PTR_0, M128I_1) \
(__builtin_ia32_movntdq((__v2di *)(M128I_PTR_0), (__v2di)(M128I_1)))
/* Thin wrappers over the clflush, lfence and mfence builtins. */
#define _mm_clflush(PTR) (__builtin_ia32_clflush((void const *)(PTR)))
#define _mm_lfence() (__builtin_ia32_lfence())
#define _mm_mfence() (__builtin_ia32_mfence())
/* Both spellings call _mm_set_epi64x with 0 as the first argument and the
   value as the second. */
#define _mm_cvtsi64_si128(LL_0) (_mm_set_epi64x(0, (LL_0)))
#define _mm_cvtsi64x_si128(LL_0) (_mm_set_epi64x(0, (LL_0)))
/* Plain vector cast from __m128i to __m128d; no builtin is involved. */
#define _mm_castsi128_pd(M128I_0) ((__m128d)(M128I_0))
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ */

View file

@ -1,14 +1,42 @@
#ifndef COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_
#include "libc/intrin/emmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse3
*/
/* Casts both float vectors to __v4sf and yields the result of the addsubps
   builtin as __m128. */
#define _mm_addsub_ps(M128_0, M128_1) \
((__m128)__builtin_ia32_addsubps((__v4sf)(M128_0), (__v4sf)(M128_1)))
/*
 * Casts both float vectors to __v4sf and yields the result of the haddps
 * builtin as __m128.
 *
 * The two operands are distinct: the first builtin argument comes from
 * M128_0 and the second from M128_1. Passing M128_0 for both would silently
 * ignore the caller's second vector.
 */
#define _mm_hadd_ps(M128_0, M128_1) \
  ((__m128)__builtin_ia32_haddps((__v4sf)(M128_0), (__v4sf)(M128_1)))
/* Float-vector forms: operands are cast to __v4sf and passed to the hsubps /
   movshdup / movsldup builtins. */
#define _mm_hsub_ps(M128_0, M128_1) \
((__m128)__builtin_ia32_hsubps((__v4sf)(M128_0), (__v4sf)(M128_1)))
#define _mm_movehdup_ps(M128_0) \
((__m128)__builtin_ia32_movshdup((__v4sf)(M128_0)))
#define _mm_moveldup_ps(M128_0) \
((__m128)__builtin_ia32_movsldup((__v4sf)(M128_0)))
/* Double-vector forms: operands are cast to __v2df and passed to the
   addsubpd / haddpd / hsubpd builtins. */
#define _mm_addsub_pd(M128D_0, M128D_1) \
((__m128d)__builtin_ia32_addsubpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_hadd_pd(M128D_0, M128D_1) \
((__m128d)__builtin_ia32_haddpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
#define _mm_hsub_pd(M128D_0, M128D_1) \
((__m128d)__builtin_ia32_hsubpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
/* _mm_shuffle_pd of the vector with itself using selector 0; the argument is
   expanded twice. */
#define _mm_movedup_pd(M128D_0) (_mm_shuffle_pd((M128D_0), (M128D_0), 0))
/* Alias that forwards the pointer to _mm_load1_pd. */
#define _mm_loaddup_pd(CONSTDBL_PTR) (_mm_load1_pd((CONSTDBL_PTR)))
/* Passes the pointer (as `char const *`) to the lddqu builtin and yields the
   result as __m128i. */
#define _mm_lddqu_si128(M128I_PTR0) \
((__m128i)__builtin_ia32_lddqu((char const *)(M128I_PTR0)))
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_ */