From ca5a08cafc20f6a0f3df01d1673abac942a598f5 Mon Sep 17 00:00:00 2001
From: Gautham
Date: Sun, 16 Apr 2023 16:29:20 -0500
Subject: [PATCH] added emmintrin and pmmintrin

---
 libc/intrin/emmintrin.internal.h | 420 +++++++++++++++++++++++++++++++
 libc/intrin/pmmintrin.internal.h |  34 ++-
 2 files changed, 451 insertions(+), 3 deletions(-)

diff --git a/libc/intrin/emmintrin.internal.h b/libc/intrin/emmintrin.internal.h
index 75e25b045..756ef6c8a 100644
--- a/libc/intrin/emmintrin.internal.h
+++ b/libc/intrin/emmintrin.internal.h
@@ -240,5 +240,425 @@ struct thatispacked mayalias __usi128ma {
 
 #define _mm_pause() asm("rep nop")
 
+#define _mm_set_sd(DBL_0) ((__m128d){(double)(DBL_0), 0.0})
+
+#define _mm_set1_pd(DBL_0) ((__m128d){(double)(DBL_0), (double)(DBL_0)})
+
+#define _mm_set_pd1(DBL_0) (_mm_set1_pd((double)(DBL_0)))
+
+#define _mm_set_pd(DBL_0, DBL_1) ((__m128d){(double)(DBL_1), (double)(DBL_0)})
+
+#define _mm_setr_pd(DBL_0, DBL_1) ((__m128d){(double)(DBL_0), (double)(DBL_1)})
+
+#define _mm_undefined_pd() \
+  ({                       \
+    __m128d __Y = __Y;     \
+    __Y;                   \
+  })
+
+#define _mm_setzero_pd() ((__m128d){0.0, 0.0})
+
+#define _mm_move_sd(M128D_0, M128D_1) \
+  ((__m128d)__builtin_shuffle((__v2df)(M128D_0), (__v2df)(M128D_1), (__v2di){2, 1}))
+
+#define _mm_load_pd(DBL_CONSTPTR_0) \
+  (*(__m128d *)(double const *)(DBL_CONSTPTR_0))
+
+#define _mm_loadu_pd(DBL_CONSTPTR_0) \
+  (*(__m128d_u *)(double const *)(DBL_CONSTPTR_0))
+
+#define _mm_load1_pd(DBL_CONSTPTR_0) \
+  (_mm_set1_pd(*(double const *)(DBL_CONSTPTR_0)))
+
+#define _mm_load_sd(DBL_CONSTPTR_0) \
+  (_mm_set_sd(*(double const *)(DBL_CONSTPTR_0)))
+
+#define _mm_load_pd1(DBL_CONSTPTR_0) \
+  (_mm_load1_pd((double const *)(DBL_CONSTPTR_0)))
+
+#define _mm_loadr_pd(DBL_CONSTPTR_0)                               \
+  ({                                                               \
+    __m128d __tmp = _mm_load_pd((double const *)(DBL_CONSTPTR_0)); \
+    __builtin_ia32_shufpd(__tmp, __tmp, 1);                        \
+  })
+
+#define _mm_store_pd(DBLPTR_0, M128D_0) \
+  (*(__m128d *)(double *)(DBLPTR_0) = (M128D_0))
+
+#define _mm_storeu_pd(DBLPTR_0, M128D_0) \
+  (*(__m128d_u *)(double *)(DBLPTR_0) = (M128D_0))
+
+#define _mm_store_sd(DBLPTR_0, M128D_0) \
+  (*(double *)(DBLPTR_0) = ((__v2df)(M128D_0))[0])
+
+#define _mm_cvtsd_f64(M128D_0) (((__v2df)(M128D_0))[0])
+
+#define _mm_storel_pd(DBLPTR_0, M128D_0) \
+  (_mm_store_sd((double *)(DBLPTR_0), (M128D_0)))
+
+#define _mm_storeh_pd(DBLPTR_0, M128D_0) \
+  (*(double *)(DBLPTR_0) = ((__v2df)(M128D_0))[1])
+
+#define _mm_store1_pd(DBLPTR_0, M128D_0) \
+  (_mm_store_pd((double *)(DBLPTR_0),    \
+                __builtin_ia32_shufpd((M128D_0), (M128D_0), 0)))
+
+#define _mm_store_pd1(DBLPTR_0, M128D_0) \
+  (_mm_store1_pd((double *)(DBLPTR_0), (M128D_0)))
+
+#define _mm_storer_pd(DBLPTR_0, M128D_0) \
+  (_mm_store_pd((double *)(DBLPTR_0),    \
+                __builtin_ia32_shufpd((M128D_0), (M128D_0), 1)))
+
+#define _mm_comieq_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_comisdeq((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_comilt_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_comisdlt((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_comile_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_comisdle((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_comigt_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_comisdgt((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_comige_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_comisdge((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_comineq_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_comisdneq((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_ucomieq_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_ucomisdeq((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_ucomilt_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_ucomisdlt((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_ucomile_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_ucomisdle((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_ucomigt_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_ucomisdgt((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_ucomige_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_ucomisdge((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_ucomineq_sd(M128D_0, M128D_1) \
+  (__builtin_ia32_ucomisdneq((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_cvtepi32_pd(M128I_0) ((__m128d)__builtin_ia32_cvtdq2pd((__v4si)(M128I_0)))
+
+#define _mm_cvtpd_epi32(M128D_0) \
+  ((__m128i)__builtin_ia32_cvtpd2dq((__v2df)(M128D_0)))
+
+#define _mm_cvtpd_pi32(M128D_0) \
+  ((__m64)__builtin_ia32_cvtpd2pi((__v2df)(M128D_0)))
+
+#define _mm_cvtpd_ps(M128D_0) \
+  ((__m128)__builtin_ia32_cvtpd2ps((__v2df)(M128D_0)))
+
+#define _mm_cvttpd_epi32(M128D_0) \
+  ((__m128i)__builtin_ia32_cvttpd2dq((__v2df)(M128D_0)))
+
+#define _mm_cvttpd_pi32(M128D_0) \
+  ((__m64)__builtin_ia32_cvttpd2pi((__v2df)(M128D_0)))
+
+#define _mm_cvtpi32_pd(M64_0) \
+  ((__m128d)__builtin_ia32_cvtpi2pd((__v2si)(M64_0)))
+
+#define _mm_cvtps_pd(M128_0) \
+  ((__m128d)__builtin_ia32_cvtps2pd((__v4sf)(M128_0)))
+
+#define _mm_cvtsd_si32(M128D_0) (__builtin_ia32_cvtsd2si((__v2df)(M128D_0)))
+
+#define _mm_cvtsd_si64(M128D_0) (__builtin_ia32_cvtsd2si64((__v2df)(M128D_0)))
+
+#define _mm_cvtsd_si64x(M128D_0) (__builtin_ia32_cvtsd2si64((__v2df)(M128D_0)))
+
+#define _mm_cvttsd_si32(M128D_0) (__builtin_ia32_cvttsd2si((__v2df)(M128D_0)))
+
+#define _mm_cvttsd_si64(M128D_0) (__builtin_ia32_cvttsd2si64((__v2df)(M128D_0)))
+
+#define _mm_cvttsd_si64x(M128D_0) \
+  (__builtin_ia32_cvttsd2si64((__v2df)(M128D_0)))
+
+#define _mm_cvtsd_ss(M128_0, M128D_1) \
+  ((__m128)__builtin_ia32_cvtsd2ss((__v4sf)(M128_0), (__v2df)(M128D_1)))
+
+#define _mm_cvtsi32_sd(M128D_0, INT_1) \
+  ((__m128d)__builtin_ia32_cvtsi2sd((__v2df)(M128D_0), (INT_1)))
+
+#define _mm_cvtsi64_sd(M128D_0, LL_1) \
+  ((__m128d)__builtin_ia32_cvtsi642sd((__v2df)(M128D_0), (LL_1)))
+
+#define _mm_cvtsi64x_sd(M128D_0, LL_1) \
+  ((__m128d)__builtin_ia32_cvtsi642sd((__v2df)(M128D_0), (LL_1)))
+
+#define _mm_cvtss_sd(M128D_0, M128_1) \
+  ((__m128d)__builtin_ia32_cvtss2sd((__v2df)(M128D_0), (__v4sf)(M128_1)))
+
+#define _mm_shuffle_pd(M128D_0, M128D_1, MSK) \
+  ((__m128d)__builtin_ia32_shufpd((__v2df)(M128D_0), (__v2df)(M128D_1), (MSK)))
+
+#define _mm_unpackhi_pd(M128D_0, M128D_1) \
+  ((__m128d)__builtin_ia32_unpckhpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_unpacklo_pd(M128D_0, M128D_1) \
+  ((__m128d)__builtin_ia32_unpcklpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_loadh_pd(M128D_0, DBL_CONSTPTR_1)         \
+  ((__m128d)__builtin_ia32_loadhpd((__v2df)(M128D_0), \
+                                   (double const *)(DBL_CONSTPTR_1)))
+
+#define _mm_loadl_pd(M128D_0, DBL_CONSTPTR_1)         \
+  ((__m128d)__builtin_ia32_loadlpd((__v2df)(M128D_0), \
+                                   (double const *)(DBL_CONSTPTR_1)))
+
+#define _mm_movemask_pd(M128D_0) (__builtin_ia32_movmskpd((__v2df)(M128D_0)))
+
+#define _mm_stream_pd(DBLPTR_0, M128D_1) \
+  (__builtin_ia32_movntpd((double *)(DBLPTR_0), (__v2df)(M128D_1)))
+
+#define _mm_castpd_ps(M128D_0) ((__m128)(M128D_0))
+
+#define _mm_castpd_si128(M128D_0) ((__m128i)(M128D_0))
+
+#define _mm_castps_pd(M128_0) ((__m128d)(M128_0))
+
+#define _mm_cvtsi128_si64(M128I_0) (((__v2di)(M128I_0))[0])
+
+#define _mm_cvtsi128_si64x(M128I_0) (((__v2di)(M128I_0))[0])
+
+#define _mm_set_epi64(LL_1, LL_0) (_mm_set_epi64x((LL_1), (LL_0)))
+
+#define _mm_set1_epi64(M64_0) (_mm_set_epi64((M64_0), (M64_0)))
+
+#define _mm_setr_epi64(M64_0, M64_1) (_mm_set_epi64((M64_1), (M64_0)))
+
+#define _mm_loadu_si128(PTR) (*(__m128i_u const *)(PTR))
+
+#define _mm_loadl_epi64(PTR) \
+  (_mm_set_epi64((__m64)0LL, *(__m64_u *)(__m128i_u const *)(PTR)))
+
+#define _mm_loadu_si64(PTR) (_mm_loadl_epi64((__m128i_u *)(PTR)))
+
+#define _mm_store_si128(M128I_PTR_0, M128I_1) (*(M128I_PTR_0) = (M128I_1))
+
+#define _mm_storeu_si128(PTR, M128I_1) (*(__m128i_u *)(PTR) = (M128I_1))
+
+#define _mm_storel_epi64(PTR, M128I_1) \
+  (*(__m64_u *)(__m128i_u *)(PTR) = (__m64)((__v2di)(M128I_1))[0])
+
+#define _mm_storeu_si64(PTR, M128I_1) \
+  (_mm_storel_epi64((__m128i_u *)(PTR), (M128I_1)))
+
+#define _mm_movepi64_pi64(M128I_1) ((__m64)((__v2di)(M128I_1))[0])
+
+#define _mm_movpi64_epi64(M64_0) (_mm_set_epi64((__m64)0LL, (M64_0)))
+
+#define _mm_move_epi64(M128I_0) \
+  ((__m128i)__builtin_ia32_movq128((__v2di)(M128I_0)))
+
+#define _mm_undefined_si128() ({ __m128i __Y = __Y; __Y; })
+
+#define _mm_cvtepi32_ps(M128I_0) \
+  ((__m128)__builtin_ia32_cvtdq2ps((__v4si)(M128I_0)))
+
+#define _mm_cvtps_epi32(M128_0) \
+  ((__m128i)__builtin_ia32_cvtps2dq((__v4sf)(M128_0)))
+
+#define _mm_cvttps_epi32(M128_0) \
+  ((__m128i)__builtin_ia32_cvttps2dq((__v4sf)(M128_0)))
+
+#define _mm_packs_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_packsswb128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_packs_epi32(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_packssdw128((__v4si)(M128I_0), (__v4si)(M128I_1)))
+
+#define _mm_packus_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_packuswb128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_unpackhi_epi8(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_punpckhbw128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
+
+#define _mm_unpackhi_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_punpckhwd128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_unpackhi_epi32(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_punpckhdq128((__v4si)(M128I_0), (__v4si)(M128I_1)))
+
+#define _mm_unpacklo_epi8(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_punpcklbw128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
+
+#define _mm_unpacklo_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_punpcklwd128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_unpacklo_epi32(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_punpckldq128((__v4si)(M128I_0), (__v4si)(M128I_1)))
+
+#define _mm_add_epi8(M128I_0, M128I_1) \
+  ((__m128i)((__v16qu)(M128I_0) + (__v16qu)(M128I_1)))
+
+#define _mm_add_epi16(M128I_0, M128I_1) \
+  ((__m128i)((__v8hu)(M128I_0) + (__v8hu)(M128I_1)))
+
+#define _mm_adds_epi8(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_paddsb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
+
+#define _mm_adds_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_paddsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_adds_epu8(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_paddusb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
+
+#define _mm_adds_epu16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_paddusw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_sub_epi8(M128I_0, M128I_1) \
+  ((__m128i)((__v16qu)(M128I_0) - (__v16qu)(M128I_1)))
+
+#define _mm_sub_epi16(M128I_0, M128I_1) \
+  ((__m128i)((__v8hu)(M128I_0) - (__v8hu)(M128I_1)))
+
+#define _mm_sub_epi64(M128I_0, M128I_1) \
+  ((__m128i)((__v2du)(M128I_0) - (__v2du)(M128I_1)))
+
+#define _mm_mulhi_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pmulhw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_mullo_epi16(M128I_0, M128I_1) \
+  ((__m128i)((__v8hu)(M128I_0) * (__v8hu)(M128I_1)))
+
+#define _mm_mul_su32(M64_0, M64_1) \
+  ((__m64)__builtin_ia32_pmuludq((__v2si)(M64_0), (__v2si)(M64_1)))
+
+#define _mm_slli_epi16(M128I_0, INT_1) \
+  ((__m128i)__builtin_ia32_psllwi128((__v8hi)(M128I_0), (INT_1)))
+
+#define _mm_srai_epi16(M128I_0, INT_1) \
+  ((__m128i)__builtin_ia32_psrawi128((__v8hi)(M128I_0), (INT_1)))
+
+#define _mm_srai_epi32(M128I_0, INT_1) \
+  ((__m128i)__builtin_ia32_psradi128((__v4si)(M128I_0), (INT_1)))
+
+#define _mm_bsrli_si128(M128I_0, N) \
+  ((__m128i)__builtin_ia32_psrldqi128((M128I_0), (N)*8))
+
+#define _mm_bslli_si128(M128I_0, N) \
+  ((__m128i)__builtin_ia32_pslldqi128((M128I_0), (N)*8))
+
+#define _mm_srli_epi16(M128I_0, INT_1) \
+  ((__m128i)__builtin_ia32_psrlwi128((__v8hi)(M128I_0), (INT_1)))
+
+#define _mm_srli_epi32(M128I_0, INT_1) \
+  ((__m128i)__builtin_ia32_psrldi128((__v4si)(M128I_0), (INT_1)))
+
+#define _mm_sll_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_psllw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_sll_epi32(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pslld128((__v4si)(M128I_0), (__v4si)(M128I_1)))
+
+#define _mm_sll_epi64(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_psllq128((__v2di)(M128I_0), (__v2di)(M128I_1)))
+
+#define _mm_sra_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_psraw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_sra_epi32(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_psrad128((__v4si)(M128I_0), (__v4si)(M128I_1)))
+
+#define _mm_srl_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_psrlw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_srl_epi32(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_psrld128((__v4si)(M128I_0), (__v4si)(M128I_1)))
+
+#define _mm_srl_epi64(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_psrlq128((__v2di)(M128I_0), (__v2di)(M128I_1)))
+
+#define _mm_cmpeq_epi16(M128I_0, M128I_1) \
+  ((__m128i)((__v8hi)(M128I_0) == (__v8hi)(M128I_1)))
+
+#define _mm_cmpeq_epi32(M128I_0, M128I_1) \
+  ((__m128i)((__v4si)(M128I_0) == (__v4si)(M128I_1)))
+
+#define _mm_cmplt_epi8(M128I_0, M128I_1) \
+  ((__m128i)((__v16qs)(M128I_0) < (__v16qs)(M128I_1)))
+
+#define _mm_cmplt_epi16(M128I_0, M128I_1) \
+  ((__m128i)((__v8hi)(M128I_0) < (__v8hi)(M128I_1)))
+
+#define _mm_cmplt_epi32(M128I_0, M128I_1) \
+  ((__m128i)((__v4si)(M128I_0) < (__v4si)(M128I_1)))
+
+#define _mm_cmpgt_epi8(M128I_0, M128I_1) \
+  ((__m128i)((__v16qs)(M128I_0) > (__v16qs)(M128I_1)))
+
+#define _mm_cmpgt_epi16(M128I_0, M128I_1) \
+  ((__m128i)((__v8hi)(M128I_0) > (__v8hi)(M128I_1)))
+
+#define _mm_cmpgt_epi32(M128I_0, M128I_1) \
+  ((__m128i)((__v4si)(M128I_0) > (__v4si)(M128I_1)))
+
+#define _mm_extract_epi16(M128I_CONST_0, INT_1)                         \
+  ((unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(M128I_CONST_0), \
+                                               (INT_1)))
+
+#define _mm_insert_epi16(M128I_CONST_0, INT_1, INT_2) \
+  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(M128I_CONST_0), (INT_1), (INT_2)))
+
+#define _mm_max_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pmaxsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_max_epu8(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pmaxub128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
+
+#define _mm_min_epi16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pminsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_min_epu8(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pminub128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
+
+#define _mm_mulhi_epu16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pmulhuw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_shufflehi_epi16(M128I_0, MSK) \
+  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(M128I_0), (MSK)))
+
+#define _mm_shufflelo_epi16(M128I_0, MSK) \
+  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(M128I_0), (MSK)))
+
+#define _mm_maskmoveu_si128(M128I_0, M128I_1, CHAR_PTR_2) \
+  (__builtin_ia32_maskmovdqu((__v16qi)(M128I_0), (__v16qi)(M128I_1), (char *)(CHAR_PTR_2)))
+
+#define _mm_avg_epu8(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pavgb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
+
+#define _mm_avg_epu16(M128I_0, M128I_1) \
+  ((__m128i)__builtin_ia32_pavgw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
+
+#define _mm_stream_si32(INT_PTR_0, INT_1) \
+  (__builtin_ia32_movnti((int *)(INT_PTR_0), (INT_1)))
+
+#define _mm_stream_si64(LL_PTR_0, LL_1) \
+  (__builtin_ia32_movnti64((long long *)(LL_PTR_0), (LL_1)))
+
+#define _mm_stream_si128(M128I_PTR_0, M128I_1) \
+  (__builtin_ia32_movntdq((__v2di *)(M128I_PTR_0), (__v2di)(M128I_1)))
+
+#define _mm_clflush(PTR) (__builtin_ia32_clflush((void const *)(PTR)))
+
+#define _mm_lfence() (__builtin_ia32_lfence())
+
+#define _mm_mfence() (__builtin_ia32_mfence())
+
+#define _mm_cvtsi64_si128(LL_0) (_mm_set_epi64x(0, (LL_0)))
+
+#define _mm_cvtsi64x_si128(LL_0) (_mm_set_epi64x(0, (LL_0)))
+
+#define _mm_castsi128_pd(M128I_0) ((__m128d)(M128I_0))
+
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
 #endif /* COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ */
diff --git a/libc/intrin/pmmintrin.internal.h b/libc/intrin/pmmintrin.internal.h
index 715bb92ae..6b4cd17e2 100644
--- a/libc/intrin/pmmintrin.internal.h
+++ b/libc/intrin/pmmintrin.internal.h
@@ -1,14 +1,42 @@
 #ifndef COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_
 #define COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_
+#include "libc/intrin/emmintrin.internal.h"
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § it's a trap! » sse3                                       ─╬─│┼
 ╚────────────────────────────────────────────────────────────────────────────│*/
 
-#define _mm_hadd_ps(M128_0, M128_1)                        \
-  ((__m128)__builtin_ia32_haddps((__v4sf)(__m128)(M128_0), \
-                                 (__v4sf)(__m128)(M128_0)))
+#define _mm_addsub_ps(M128_0, M128_1) \
+  ((__m128)__builtin_ia32_addsubps((__v4sf)(M128_0), (__v4sf)(M128_1)))
+
+#define _mm_hadd_ps(M128_0, M128_1) \
+  ((__m128)__builtin_ia32_haddps((__v4sf)(M128_0), (__v4sf)(M128_1)))
+
+#define _mm_hsub_ps(M128_0, M128_1) \
+  ((__m128)__builtin_ia32_hsubps((__v4sf)(M128_0), (__v4sf)(M128_1)))
+
+#define _mm_movehdup_ps(M128_0) \
+  ((__m128)__builtin_ia32_movshdup((__v4sf)(M128_0)))
+
+#define _mm_moveldup_ps(M128_0) \
+  ((__m128)__builtin_ia32_movsldup((__v4sf)(M128_0)))
+
+#define _mm_addsub_pd(M128D_0, M128D_1) \
+  ((__m128d)__builtin_ia32_addsubpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_hadd_pd(M128D_0, M128D_1) \
+  ((__m128d)__builtin_ia32_haddpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_hsub_pd(M128D_0, M128D_1) \
+  ((__m128d)__builtin_ia32_hsubpd((__v2df)(M128D_0), (__v2df)(M128D_1)))
+
+#define _mm_movedup_pd(M128D_0) (_mm_shuffle_pd((M128D_0), (M128D_0), 0))
+
+#define _mm_loaddup_pd(CONSTDBL_PTR) (_mm_load1_pd((CONSTDBL_PTR)))
+
+#define _mm_lddqu_si128(M128I_PTR0) \
+  ((__m128i)__builtin_ia32_lddqu((char const *)(M128I_PTR0)))
 
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
 #endif /* COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_ */
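
A quick way to sanity-check the new macros is a small standalone program along the lines of the sketch below. It is illustrative only and not part of the patch: the bare main() with assert() is an assumption rather than Cosmopolitan's own test conventions, and it only exercises a handful of the added intrinsics (_mm_set_pd, _mm_set1_pd, _mm_hadd_pd, _mm_storeu_pd, _mm_cvttsd_si32, _mm_loadu_si128, _mm_adds_epu8, _mm_storeu_si128).

/* illustrative smoke test; assumes SSE3 is enabled for the target */
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "libc/intrin/pmmintrin.internal.h" /* now pulls in emmintrin too */

int main(void) {
  /* double lanes: _mm_set_pd takes (high, low); _mm_storeu_pd writes low first */
  double d[2];
  __m128d x = _mm_set_pd(4.0, 3.0);     /* x = {3.0, 4.0} */
  __m128d y = _mm_set1_pd(0.5);         /* y = {0.5, 0.5} */
  _mm_storeu_pd(d, _mm_hadd_pd(x, y));  /* horizontal add -> {7.0, 1.0} */
  assert(d[0] == 7.0 && d[1] == 1.0);
  assert(_mm_cvttsd_si32(x) == 3);      /* truncating convert of the low lane */

  /* byte lanes: unsigned saturating add clamps 200 + 200 at 255 */
  uint8_t in[16], out[16];
  memset(in, 200, sizeof(in));
  __m128i a = _mm_loadu_si128((__m128i_u const *)in);
  _mm_storeu_si128((__m128i_u *)out, _mm_adds_epu8(a, a));
  assert(out[0] == 255 && out[15] == 255);
  return 0;
}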