mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 19:43:32 +00:00
56 lines
2.1 KiB
C
56 lines
2.1 KiB
C
|
#ifndef COSMOPOLITAN_LIBC_INTRIN_PALIGNR_H_
|
|||
|
#define COSMOPOLITAN_LIBC_INTRIN_PALIGNR_H_
|
|||
|
#include "libc/assert.h"
|
|||
|
#include "libc/intrin/macros.h"
|
|||
|
#include "libc/macros.h"
|
|||
|
#include "libc/str/str.h"
|
|||
|
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
|||
|
|
|||
|
/* Defined outside this header; palignr()'s documentation points here via
   @see. NOTE(review): presumably the counterpart of palignr() for an
   offset that is not a compile-time constant, with the same
   (c, b, a, i) argument order - confirm against its definition. */
void pvalignr(void *, const void *, const void *, size_t);
|
|||
|
|
|||
|
/**
 * Copies a 16-byte window out of two concatenated vectors.
 *
 * The sixteen bytes at 𝑎, the sixteen bytes at 𝑏, and sixteen zero
 * bytes are laid end to end; 𝑐 receives the sixteen bytes that start
 * at byte offset min(𝑖, 32) of that sequence:
 *
 *     𝑖= 0     means 𝑐←𝑎
 *     0<𝑖<16   means 𝑐←𝑎║𝑏
 *     𝑖=16     means 𝑐←𝑏
 *     16<𝑖<32  means 𝑐←𝑏║0
 *     𝑖≥32     means 𝑐←0
 *
 * Both inputs are copied into a scratch buffer before 𝑐 is written,
 * so 𝑐 is allowed to point at 𝑎 or 𝑏.
 *
 * @param c receives 16 bytes
 * @param b points to the 16 bytes forming the upper half
 * @param a points to the 16 bytes forming the lower half
 * @param 𝑖 needs to be a literal, constexpr, or embedding
 * @see pvalignr()
 * @mayalias
 */
static void palignr(void *c, const void *b, const void *a, size_t i) {
  /* layout: [0,16) = a, [16,32) = b, [32,48) = zero padding */
  char window[48] = {0};
  size_t skip = i < 32 ? i : 32;
  memcpy(window, a, 16);
  memcpy(window + 16, b, 16);
  memcpy(c, window + skip, 16);
}
|
|||
|
|
|||
|
#ifndef __STRICT_ANSI__
/*
 * Macro front end for palignr() that emits the instruction inline.
 *
 * The palignr() function is called instead whenever IsModeDbg() is
 * true, X86_NEED(SSE) is false, or X86_HAVE(SSSE3) is false. The
 * nested palignr(...) in the expansion is not re-expanded by the
 * preprocessor, so it resolves to that function.
 *
 * Otherwise C, B and A are treated as pointers to __intrin_xmm_t and a
 * single instruction is issued: `vpalignr` when X86_NEED(AVX) is true,
 * `palignr` when it is not. In the two-operand form the output
 * register is preloaded with *B through the "0" matching constraint.
 * I is bound with an "i" constraint in both forms, which is why it has
 * to be an immediate known at compile time.
 */
#define palignr(C, B, A, I)                                              \
  do {                                                                   \
    if (IsModeDbg() || !X86_NEED(SSE) || !X86_HAVE(SSSE3)) {             \
      palignr(C, B, A, I);                                               \
    } else {                                                             \
      __intrin_xmm_t *PalignrDst_ = (void *)(C);                         \
      const __intrin_xmm_t *PalignrHi_ = (const __intrin_xmm_t *)(B);    \
      const __intrin_xmm_t *PalignrLo_ = (const __intrin_xmm_t *)(A);    \
      if (X86_NEED(AVX)) {                                               \
        asm("vpalignr\t%3,%2,%1,%0"                                      \
            : "=x"(*PalignrDst_)                                         \
            : "x"(*PalignrHi_), "x"(*PalignrLo_), "i"(I));               \
      } else {                                                           \
        asm("palignr\t%2,%1,%0"                                          \
            : "=x"(*PalignrDst_)                                         \
            : "x"(*PalignrLo_), "i"(I), "0"(*PalignrHi_));               \
      }                                                                  \
    }                                                                    \
  } while (0)
#endif
|
|||
|
|
|||
|
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
|||
|
#endif /* COSMOPOLITAN_LIBC_INTRIN_PALIGNR_H_ */
|